From abe0dc131b045cf213198597b71b2a72b469ae0c Mon Sep 17 00:00:00 2001
From: bdim404 <i@bdim.moe>
Date: Tue, 3 Feb 2026 15:44:25 +0800
Subject: [PATCH] feat: Implement Aliyun TTS backend integration and API key
 management

---
 qwen3-tts-backend/.env.example                |   4 +
 qwen3-tts-backend/api/auth.py                 |  80 +++-
 qwen3-tts-backend/api/tts.py                  | 232 +++++------
 qwen3-tts-backend/config.py                   |   8 +
 qwen3-tts-backend/core/security.py            |  24 ++
 qwen3-tts-backend/core/tts_service.py         | 371 ++++++++++++++++++
 qwen3-tts-backend/db/crud.py                  |  15 +
 qwen3-tts-backend/db/models.py                |   2 +
 qwen3-tts-backend/main.py                     |  22 ++
 qwen3-tts-backend/requirements.txt            |   2 +
 qwen3-tts-backend/schemas/tts.py              |   2 +
 qwen3-tts-backend/schemas/user.py             |   7 +
 qwen3-tts-frontend/index.html                 |   2 +-
 qwen3-tts-frontend/public/qwen.svg            |   1 +
 .../src/components/tts/CustomVoiceForm.tsx    |  19 +
 .../src/components/tts/VoiceCloneForm.tsx     |  18 +
 .../src/components/tts/VoiceDesignForm.tsx    |  19 +
 qwen3-tts-frontend/src/lib/api.ts             |   3 +
 qwen3-tts-frontend/src/types/tts.ts           |   3 +
 19 files changed, 716 insertions(+), 118 deletions(-)
 create mode 100644 qwen3-tts-backend/core/tts_service.py
 create mode 100644 qwen3-tts-frontend/public/qwen.svg

diff --git a/qwen3-tts-backend/.env.example b/qwen3-tts-backend/.env.example
index 7d4e8c2..b012d8b 100644
--- a/qwen3-tts-backend/.env.example
+++ b/qwen3-tts-backend/.env.example
@@ -20,3 +20,7 @@ BATCH_SIZE=4
 BATCH_WAIT_TIME=0.5
 MAX_TEXT_LENGTH=1000
 MAX_AUDIO_SIZE_MB=10
+
+ALIYUN_REGION=beijing
+
+DEFAULT_BACKEND=local
diff --git a/qwen3-tts-backend/api/auth.py b/qwen3-tts-backend/api/auth.py
index ec8a599..1869d77 100644
--- a/qwen3-tts-backend/api/auth.py
+++ b/qwen3-tts-backend/api/auth.py
@@ -14,8 +14,8 @@ from core.security import (
     decode_access_token
 )
 from db.database import get_db
-from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password
-from schemas.user import User, UserCreate, Token, PasswordChange
+from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password, update_user_aliyun_key
+from schemas.user import User, UserCreate, Token, PasswordChange, AliyunKeyUpdate, AliyunKeyVerifyResponse
 
 router = APIRouter(prefix="/auth", tags=["authentication"])
 
@@ -135,3 +135,79 @@ async def change_password(
         )
 
     return user
+
+@router.post("/aliyun-key", response_model=User)
+@limiter.limit("5/minute")
+async def set_aliyun_key(
+    request: Request,
+    key_data: AliyunKeyUpdate,
+    current_user: Annotated[User, Depends(get_current_user)],
+    db: Session = Depends(get_db)
+):
+    """Health-check the submitted Aliyun key, then store it encrypted on the current user (400/404 on failure)."""
+    from core.security import encrypt_api_key
+    from core.tts_service import AliyunTTSBackend
+    api_key = key_data.api_key.strip()
+
+    aliyun_backend = AliyunTTSBackend(api_key=api_key, region=settings.ALIYUN_REGION)
+    health = await aliyun_backend.health_check()  # probe with the candidate key before anything is persisted
+
+    if not health.get("available", False):  # a missing "available" entry counts as a failed check
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Invalid Aliyun API key. Please check your API key and try again."
+        )
+
+    encrypted_key = encrypt_api_key(api_key)  # only the encrypted form is handed to the CRUD layer
+
+    user = update_user_aliyun_key(
+        db,
+        user_id=current_user.id,
+        encrypted_api_key=encrypted_key
+    )
+
+    if not user:  # the update returned nothing for this user id
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="User not found"
+        )
+
+    return user
+
+@router.get("/aliyun-key/verify", response_model=AliyunKeyVerifyResponse)
+@limiter.limit("10/minute")
+async def verify_aliyun_key(
+    request: Request,
+    current_user: Annotated[User, Depends(get_current_user)],
+    db: Session = Depends(get_db)  # not referenced in the body
+):
+    """Report whether the caller's stored Aliyun key decrypts and passes the backend health check."""
+    from core.security import decrypt_api_key
+    from core.tts_service import AliyunTTSBackend
+    if not current_user.aliyun_api_key:
+        return AliyunKeyVerifyResponse(
+            valid=False,
+            message="No Aliyun API key configured"
+        )
+
+    api_key = decrypt_api_key(current_user.aliyun_api_key)  # None when the stored token cannot be decrypted
+
+    if not api_key:
+        return AliyunKeyVerifyResponse(
+            valid=False,
+            message="Failed to decrypt API key"
+        )
+
+    aliyun_backend = AliyunTTSBackend(api_key=api_key, region=settings.ALIYUN_REGION)
+    health = await aliyun_backend.health_check()
+
+    if health.get("available", False):  # a missing "available" entry counts as not working
+        return AliyunKeyVerifyResponse(
+            valid=True,
+            message="Aliyun API key is valid and working"
+        )
+    else:
+        return AliyunKeyVerifyResponse(
+            valid=False,
+            message="Aliyun API key is not working. Please check your API key."
+        )
diff --git a/qwen3-tts-backend/api/tts.py b/qwen3-tts-backend/api/tts.py
index 004f84b..a584c9c 100644
--- a/qwen3-tts-backend/api/tts.py
+++ b/qwen3-tts-backend/api/tts.py
@@ -36,9 +36,12 @@ async def process_custom_voice_job(
     job_id: int,
     user_id: int,
     request_data: dict,
+    backend_type: str,
     db_url: str
 ):
     from core.database import SessionLocal
+    from core.tts_service import TTSServiceFactory
+    from core.security import decrypt_api_key
 
     db = SessionLocal()
     try:
@@ -51,42 +54,24 @@ async def process_custom_voice_job(
         job.started_at = datetime.utcnow()
         db.commit()
 
-        logger.info(f"Processing custom-voice job {job_id}")
+        logger.info(f"Processing custom-voice job {job_id} with backend {backend_type}")
 
-        model_manager = await ModelManager.get_instance()
-        await model_manager.load_model("custom-voice")
-        _, tts = await model_manager.get_current_model()
+        user_api_key = None
+        if backend_type == "aliyun":
+            user = db.query(User).filter(User.id == user_id).first()
+            if user and user.aliyun_api_key:
+                user_api_key = decrypt_api_key(user.aliyun_api_key)
 
-        if tts is None:
-            raise RuntimeError("Failed to load custom-voice model")
+        backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
 
-        result = tts.generate_custom_voice(
-            text=request_data['text'],
-            language=request_data['language'],
-            speaker=request_data['speaker'],
-            instruct=request_data.get('instruct', ''),
-            max_new_tokens=request_data['max_new_tokens'],
-            temperature=request_data['temperature'],
-            top_k=request_data['top_k'],
-            top_p=request_data['top_p'],
-            repetition_penalty=request_data['repetition_penalty']
-        )
-
-        import numpy as np
-        if isinstance(result, tuple):
-            audio_data = result[0]
-        elif isinstance(result, list):
-            audio_data = np.array(result)
-        else:
-            audio_data = result
-
-        from pathlib import Path
+        audio_bytes, sample_rate = await backend.generate_custom_voice(request_data)
 
         timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
         filename = f"{user_id}_{job_id}_{timestamp}.wav"
         output_path = Path(settings.OUTPUT_DIR) / filename
 
-        save_audio_file(audio_data, 24000, output_path)
+        with open(output_path, 'wb') as f:
+            f.write(audio_bytes)
 
         job.status = JobStatus.COMPLETED
         job.output_path = str(output_path)
@@ -112,9 +97,12 @@ async def process_voice_design_job(
     job_id: int,
     user_id: int,
     request_data: dict,
+    backend_type: str,
     db_url: str
 ):
     from core.database import SessionLocal
+    from core.tts_service import TTSServiceFactory
+    from core.security import decrypt_api_key
 
     db = SessionLocal()
     try:
@@ -127,41 +115,24 @@ async def process_voice_design_job(
         job.started_at = datetime.utcnow()
         db.commit()
 
-        logger.info(f"Processing voice-design job {job_id}")
+        logger.info(f"Processing voice-design job {job_id} with backend {backend_type}")
 
-        model_manager = await ModelManager.get_instance()
-        await model_manager.load_model("voice-design")
-        _, tts = await model_manager.get_current_model()
+        user_api_key = None
+        if backend_type == "aliyun":
+            user = db.query(User).filter(User.id == user_id).first()
+            if user and user.aliyun_api_key:
+                user_api_key = decrypt_api_key(user.aliyun_api_key)
 
-        if tts is None:
-            raise RuntimeError("Failed to load voice-design model")
+        backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
 
-        result = tts.generate_voice_design(
-            text=request_data['text'],
-            language=request_data['language'],
-            instruct=request_data['instruct'],
-            max_new_tokens=request_data['max_new_tokens'],
-            temperature=request_data['temperature'],
-            top_k=request_data['top_k'],
-            top_p=request_data['top_p'],
-            repetition_penalty=request_data['repetition_penalty']
-        )
-
-        import numpy as np
-        if isinstance(result, tuple):
-            audio_data = result[0]
-        elif isinstance(result, list):
-            audio_data = np.array(result)
-        else:
-            audio_data = result
-
-        from pathlib import Path
+        audio_bytes, sample_rate = await backend.generate_voice_design(request_data)
 
         timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
         filename = f"{user_id}_{job_id}_{timestamp}.wav"
         output_path = Path(settings.OUTPUT_DIR) / filename
 
-        save_audio_file(audio_data, 24000, output_path)
+        with open(output_path, 'wb') as f:
+            f.write(audio_bytes)
 
         job.status = JobStatus.COMPLETED
         job.output_path = str(output_path)
@@ -188,9 +159,11 @@ async def process_voice_clone_job(
     user_id: int,
     request_data: dict,
     ref_audio_path: str,
+    backend_type: str,
     db_url: str
 ):
     from core.database import SessionLocal
+    from core.tts_service import TTSServiceFactory
     import numpy as np
 
     db = SessionLocal()
@@ -204,7 +177,14 @@ async def process_voice_clone_job(
         job.started_at = datetime.utcnow()
         db.commit()
 
-        logger.info(f"Processing voice-clone job {job_id}")
+        logger.info(f"Processing voice-clone job {job_id} with backend {backend_type}")
+
+        from core.security import decrypt_api_key
+        user_api_key = None
+        if backend_type == "aliyun":
+            user = db.query(User).filter(User.id == user_id).first()
+            if user and user.aliyun_api_key:
+                user_api_key = decrypt_api_key(user.aliyun_api_key)
 
         with open(ref_audio_path, 'rb') as f:
             ref_audio_data = f.read()
@@ -212,49 +192,49 @@ async def process_voice_clone_job(
         cache_manager = await VoiceCacheManager.get_instance()
         ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data)
 
-        x_vector = None
-        cache_id = None
-
-        if request_data.get('use_cache', True):
-            cached = await cache_manager.get_cache(user_id, ref_audio_hash, db)
-            if cached:
-                x_vector = cached['data']
-                cache_id = cached['cache_id']
-                cache_metrics.record_hit(user_id)
-                logger.info(f"Cache hit for job {job_id}, cache_id={cache_id}")
-
-        if x_vector is None:
-            cache_metrics.record_miss(user_id)
-            logger.info(f"Cache miss for job {job_id}, creating voice clone prompt")
-            ref_audio_array, ref_sr = process_ref_audio(ref_audio_data)
-
-            model_manager = await ModelManager.get_instance()
-            await model_manager.load_model("base")
-            _, tts = await model_manager.get_current_model()
-
-            if tts is None:
-                raise RuntimeError("Failed to load base model")
-
-            x_vector = tts.create_voice_clone_prompt(
-                ref_audio=(ref_audio_array, ref_sr),
-                ref_text=request_data.get('ref_text', ''),
-                x_vector_only_mode=request_data.get('x_vector_only_mode', False)
-            )
+        if request_data.get('x_vector_only_mode', False) and backend_type == "local":
+            x_vector = None
+            cache_id = None
 
             if request_data.get('use_cache', True):
-                features = extract_audio_features(ref_audio_array, ref_sr)
-                metadata = {
-                    'duration': features['duration'],
-                    'sample_rate': features['sample_rate'],
-                    'ref_text': request_data.get('ref_text', ''),
-                    'x_vector_only_mode': request_data.get('x_vector_only_mode', False)
-                }
-                cache_id = await cache_manager.set_cache(
-                    user_id, ref_audio_hash, x_vector, metadata, db
-                )
-                logger.info(f"Created cache for job {job_id}, cache_id={cache_id}")
+                cached = await cache_manager.get_cache(user_id, ref_audio_hash, db)
+                if cached:
+                    x_vector = cached['data']
+                    cache_id = cached['cache_id']
+                    cache_metrics.record_hit(user_id)
+                    logger.info(f"Cache hit for job {job_id}, cache_id={cache_id}")
+
+            if x_vector is None:
+                cache_metrics.record_miss(user_id)
+                logger.info(f"Cache miss for job {job_id}, creating voice clone prompt")
+                ref_audio_array, ref_sr = process_ref_audio(ref_audio_data)
+
+                model_manager = await ModelManager.get_instance()
+                await model_manager.load_model("base")
+                _, tts = await model_manager.get_current_model()
+
+                if tts is None:
+                    raise RuntimeError("Failed to load base model")
+
+                x_vector = tts.create_voice_clone_prompt(
+                    ref_audio=(ref_audio_array, ref_sr),
+                    ref_text=request_data.get('ref_text', ''),
+                    x_vector_only_mode=True
+                )
+
+                if request_data.get('use_cache', True):
+                    features = extract_audio_features(ref_audio_array, ref_sr)
+                    metadata = {
+                        'duration': features['duration'],
+                        'sample_rate': features['sample_rate'],
+                        'ref_text': request_data.get('ref_text', ''),
+                        'x_vector_only_mode': True
+                    }
+                    cache_id = await cache_manager.set_cache(
+                        user_id, ref_audio_hash, x_vector, metadata, db
+                    )
+                    logger.info(f"Created cache for job {job_id}, cache_id={cache_id}")
 
-        if request_data.get('x_vector_only_mode', False):
             job.status = JobStatus.COMPLETED
             job.output_path = f"x_vector_cached_{cache_id}"
             job.completed_at = datetime.utcnow()
@@ -262,31 +242,16 @@ async def process_voice_clone_job(
             logger.info(f"Job {job_id} completed (x_vector_only_mode)")
             return
 
-        model_manager = await ModelManager.get_instance()
-        await model_manager.load_model("base")
-        _, tts = await model_manager.get_current_model()
+        backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
 
-        if tts is None:
-            raise RuntimeError("Failed to load base model")
-
-        wavs, sample_rate = tts.generate_voice_clone(
-            text=request_data['text'],
-            language=request_data['language'],
-            voice_clone_prompt=x_vector,
-            max_new_tokens=request_data['max_new_tokens'],
-            temperature=request_data['temperature'],
-            top_k=request_data['top_k'],
-            top_p=request_data['top_p'],
-            repetition_penalty=request_data['repetition_penalty']
-        )
-
-        audio_data = wavs[0] if isinstance(wavs, list) else wavs
+        audio_bytes, sample_rate = await backend.generate_voice_clone(request_data, ref_audio_data)
 
         timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
         filename = f"{user_id}_{job_id}_{timestamp}.wav"
         output_path = Path(settings.OUTPUT_DIR) / filename
 
-        save_audio_file(audio_data, sample_rate, output_path)
+        with open(output_path, 'wb') as f:
+            f.write(audio_bytes)
 
         job.status = JobStatus.COMPLETED
         job.output_path = str(output_path)
@@ -319,6 +284,16 @@ async def create_custom_voice_job(
     current_user: User = Depends(get_current_user),
     db: Session = Depends(get_db)
 ):
+    from core.security import decrypt_api_key
+
+    backend_type = req_data.backend or settings.DEFAULT_BACKEND
+    if backend_type == "aliyun":
+        if not current_user.aliyun_api_key:
+            raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
+        user_api_key = decrypt_api_key(current_user.aliyun_api_key)
+        if not user_api_key:
+            raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
+
     try:
         validate_text_length(req_data.text)
         language = validate_language(req_data.language)
@@ -339,6 +314,7 @@ async def create_custom_voice_job(
         user_id=current_user.id,
         job_type="custom-voice",
         status=JobStatus.PENDING,
+        backend_type=backend_type,
         input_data="",
         input_params={
             "text": req_data.text,
@@ -365,6 +341,7 @@ async def create_custom_voice_job(
         job.id,
         current_user.id,
         request_data,
+        backend_type,
         str(settings.DATABASE_URL)
     )
 
@@ -384,6 +361,16 @@ async def create_voice_design_job(
     current_user: User = Depends(get_current_user),
     db: Session = Depends(get_db)
 ):
+    from core.security import decrypt_api_key
+
+    backend_type = req_data.backend or settings.DEFAULT_BACKEND
+    if backend_type == "aliyun":
+        if not current_user.aliyun_api_key:
+            raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
+        user_api_key = decrypt_api_key(current_user.aliyun_api_key)
+        if not user_api_key:
+            raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
+
     try:
         validate_text_length(req_data.text)
         language = validate_language(req_data.language)
@@ -406,6 +393,7 @@ async def create_voice_design_job(
         user_id=current_user.id,
         job_type="voice-design",
         status=JobStatus.PENDING,
+        backend_type=backend_type,
         input_data="",
         input_params={
             "text": req_data.text,
@@ -430,6 +418,7 @@ async def create_voice_design_job(
         job.id,
         current_user.id,
         request_data,
+        backend_type,
         str(settings.DATABASE_URL)
     )
 
@@ -455,10 +444,21 @@ async def create_voice_clone_job(
     top_k: Optional[int] = Form(default=50),
     top_p: Optional[float] = Form(default=1.0),
     repetition_penalty: Optional[float] = Form(default=1.05),
+    backend: Optional[str] = Form(default=None),
     background_tasks: BackgroundTasks = None,
     current_user: User = Depends(get_current_user),
     db: Session = Depends(get_db)
 ):
+    from core.security import decrypt_api_key
+
+    backend_type = backend or settings.DEFAULT_BACKEND
+    if backend_type == "aliyun":
+        if not current_user.aliyun_api_key:
+            raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
+        user_api_key = decrypt_api_key(current_user.aliyun_api_key)
+        if not user_api_key:
+            raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
+
     try:
         validate_text_length(text)
         language = validate_language(language)
@@ -486,6 +486,7 @@ async def create_voice_clone_job(
         user_id=current_user.id,
         job_type="voice-clone",
         status=JobStatus.PENDING,
+        backend_type=backend_type,
         input_data="",
         input_params={
             "text": text,
@@ -520,6 +521,7 @@ async def create_voice_clone_job(
         current_user.id,
         request_data,
         tmp_audio_path,
+        backend_type,
         str(settings.DATABASE_URL)
     )
 
diff --git a/qwen3-tts-backend/config.py b/qwen3-tts-backend/config.py
index 24c2572..6110973 100644
--- a/qwen3-tts-backend/config.py
+++ b/qwen3-tts-backend/config.py
@@ -36,6 +36,14 @@ class Settings(BaseSettings):
     MAX_TEXT_LENGTH: int = Field(default=1000)
     MAX_AUDIO_SIZE_MB: int = Field(default=10)
 
+    ALIYUN_REGION: str = Field(default="beijing")
+
+    ALIYUN_MODEL_FLASH: str = Field(default="qwen3-tts-flash-realtime")
+    ALIYUN_MODEL_VC: str = Field(default="qwen3-tts-vc-realtime-2026-01-15")
+    ALIYUN_MODEL_VD: str = Field(default="qwen3-tts-vd-realtime-2026-01-15")
+
+    DEFAULT_BACKEND: str = Field(default="local")
+
     class Config:
         env_file = ".env"
         case_sensitive = True
diff --git a/qwen3-tts-backend/core/security.py b/qwen3-tts-backend/core/security.py
index 610018b..7c61ee5 100644
--- a/qwen3-tts-backend/core/security.py
+++ b/qwen3-tts-backend/core/security.py
@@ -2,6 +2,9 @@ from datetime import datetime, timedelta
 from typing import Optional
 from jose import JWTError, jwt
 from passlib.context import CryptContext
+from cryptography.fernet import Fernet
+import base64
+import hashlib
 
 from config import settings
 
@@ -33,3 +36,24 @@ def decode_access_token(token: str) -> Optional[str]:
         return username
     except JWTError:
         return None
+
+def _get_fernet_key() -> bytes:
+    """Derive the Fernet key: urlsafe-base64 of the SHA-256 digest of SECRET_KEY."""
+    return base64.urlsafe_b64encode(hashlib.sha256(settings.SECRET_KEY.encode()).digest())
+
+def encrypt_api_key(api_key: str) -> str:
+    """Return the Fernet token for ``api_key`` as text; a falsy key yields ""."""
+    if not api_key:
+        return ""
+    token = Fernet(_get_fernet_key()).encrypt(api_key.encode())
+    return token.decode()
+
+def decrypt_api_key(encrypted_key: str) -> Optional[str]:
+    """Return the plaintext of a stored token, or None when it is empty or cannot be decrypted."""
+    if not encrypted_key:
+        return None
+    try:
+        plaintext = Fernet(_get_fernet_key()).decrypt(encrypted_key.encode())
+        return plaintext.decode()
+    except Exception:
+        return None
diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py
new file mode 100644
index 0000000..1e0d982
--- /dev/null
+++ b/qwen3-tts-backend/core/tts_service.py
@@ -0,0 +1,371 @@
+import time
+import logging
+from abc import ABC, abstractmethod
+from typing import Tuple, Optional
+import websockets
+import json
+import base64
+
+logger = logging.getLogger(__name__)
+
+
+class TTSBackend(ABC):  # interface shared by the local and Aliyun backends
+    @abstractmethod
+    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
+        """Synthesize speech for the speaker named in ``params``; return (audio bytes, sample rate)."""
+
+    @abstractmethod
+    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
+        """Synthesize speech in a voice described by ``params``; return (audio bytes, sample rate)."""
+
+    @abstractmethod
+    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
+        """Synthesize speech imitating the reference audio; return (audio bytes, sample rate)."""
+
+    @abstractmethod
+    async def health_check(self) -> dict:
+        """Return a status dict; the implementations in this module include an "available" flag."""
+
+
+class LocalTTSBackend(TTSBackend):  # backend that runs the models loaded through ModelManager
+    def __init__(self):
+        self.model_manager = None  # set by initialize(); the generate_* methods dereference it
+
+    async def initialize(self):
+        from core.model_manager import ModelManager
+        self.model_manager = await ModelManager.get_instance()
+
+    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
+        """Run the "custom-voice" model on ``params``; return (WAV bytes, 24000)."""
+        await self.model_manager.load_model("custom-voice")
+        _, tts = await self.model_manager.get_current_model()
+        result = tts.generate_custom_voice(
+            text=params['text'],
+            language=params['language'],
+            speaker=params['speaker'],
+            instruct=params.get('instruct', ''),
+            max_new_tokens=params['max_new_tokens'],
+            temperature=params['temperature'],
+            top_k=params['top_k'],
+            top_p=params['top_p'],
+            repetition_penalty=params['repetition_penalty']
+        )
+
+        import numpy as np
+        if isinstance(result, tuple):
+            audio_data = result[0]
+        elif isinstance(result, list):
+            audio_data = np.array(result)
+        else:
+            audio_data = result
+
+        return self._numpy_to_bytes(audio_data), 24000
+
+    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
+        """Run the "voice-design" model on ``params``; return (WAV bytes, 24000)."""
+        await self.model_manager.load_model("voice-design")
+        _, tts = await self.model_manager.get_current_model()
+        result = tts.generate_voice_design(
+            text=params['text'],
+            language=params['language'],
+            instruct=params['instruct'],
+            max_new_tokens=params['max_new_tokens'],
+            temperature=params['temperature'],
+            top_k=params['top_k'],
+            top_p=params['top_p'],
+            repetition_penalty=params['repetition_penalty']
+        )
+
+        # A tuple result carries the audio in its first element; anything else is used as-is.
+        audio_data = result[0] if isinstance(result, tuple) else result
+        return self._numpy_to_bytes(audio_data), 24000
+
+    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
+        """Build a clone prompt from the reference audio with the "base" model, then synthesize with it."""
+        from utils.audio import process_ref_audio
+        ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes)
+
+        await self.model_manager.load_model("base")
+        _, tts = await self.model_manager.get_current_model()
+
+        x_vector = tts.create_voice_clone_prompt(
+            ref_audio=(ref_audio_array, ref_sr),
+            ref_text=params.get('ref_text', ''),
+            x_vector_only_mode=False
+        )
+
+        wavs, sample_rate = tts.generate_voice_clone(
+            text=params['text'],
+            language=params['language'],
+            voice_clone_prompt=x_vector,
+            max_new_tokens=params['max_new_tokens'],
+            temperature=params['temperature'],
+            top_k=params['top_k'],
+            top_p=params['top_p'],
+            repetition_penalty=params['repetition_penalty']
+        )
+
+        audio_data = wavs[0] if isinstance(wavs, list) else wavs
+        return self._numpy_to_bytes(audio_data, sample_rate), sample_rate  # header rate must match the returned rate
+
+    async def health_check(self) -> dict:
+        return {  # "available" only reflects whether initialize() has stored a manager
+            "available": self.model_manager is not None,
+            "current_model": self.model_manager.current_model_name if self.model_manager else None
+        }
+
+    @staticmethod
+    def _numpy_to_bytes(audio_array, sample_rate: int = 24000) -> bytes:
+        """Encode samples as int16 WAV bytes at ``sample_rate``; the 32767 scale assumes floats in [-1, 1] (TODO confirm)."""
+        import numpy as np
+        import io
+        import scipy.io.wavfile
+        buffer = io.BytesIO()
+        scipy.io.wavfile.write(buffer, sample_rate, (audio_array * 32767).astype(np.int16))
+        buffer.seek(0)
+        return buffer.read()
+
+
+class AliyunTTSBackend(TTSBackend):
+    def __init__(self, api_key: str, region: str):
+        self.api_key = api_key  # sent as a Bearer token by the request methods
+        self.region = region
+        self.ws_url = self._get_ws_url(region)  # realtime WebSocket endpoint for this region
+        self.http_url = self._get_http_url(region)  # HTTP customization endpoint for this region
+
+    def _get_ws_url(self, region: str) -> str:
+        """Return the realtime WebSocket URL: the dashscope host for "beijing", dashscope-intl otherwise."""
+        if region != "beijing":
+            return "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"
+        return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
+
+    def _get_http_url(self, region: str) -> str:
+        """Return the customization HTTP URL: the dashscope host for "beijing", dashscope-intl otherwise."""
+        if region != "beijing":
+            return "https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization"
+        return "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
+
+    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
+        """Synthesize params['text'] with the flash model and the mapped speaker; return (wav_bytes, 24000)."""
+        from config import settings  # top-level config module, as imported by core/security.py
+        voice = self._map_speaker(params['speaker'])  # the mapped value is sent as the session voice
+        model = settings.ALIYUN_MODEL_FLASH
+
+        return await self._generate_via_websocket(
+            model=model,
+            text=params['text'],
+            voice=voice,
+            language=params['language']
+        )
+
+    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
+        """Create a designed voice from params['instruct'], then synthesize params['text'] with it."""
+        from config import settings  # top-level config module, as imported by core/security.py
+        voice_id = await self._create_voice_design(
+            instruct=params['instruct'],
+            preview_text=params['text']
+        )
+
+        model = settings.ALIYUN_MODEL_VD
+
+        return await self._generate_via_websocket(
+            model=model,
+            text=params['text'],
+            voice=voice_id,
+            language=params['language']
+        )
+
+    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
+        """Enroll the reference audio as a cloned voice, then synthesize params['text'] with it."""
+        from config import settings  # top-level config module, as imported by core/security.py
+        voice_id = await self._create_voice_clone(ref_audio_bytes)
+
+        model = settings.ALIYUN_MODEL_VC
+
+        return await self._generate_via_websocket(
+            model=model,
+            text=params['text'],
+            voice=voice_id,
+            language=params['language']
+        )
+
+    async def _generate_via_websocket(
+        self,
+        model: str,
+        text: str,
+        voice: str,
+        language: str
+    ) -> Tuple[bytes, int]:
+        """Send ``text`` over the realtime WebSocket, collect the audio deltas and return (wav_bytes, 24000)."""
+        audio_chunks = []
+        url = f"{self.ws_url}?model={model}"
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+
+        async with websockets.connect(url, additional_headers=headers) as ws:
+            await ws.send(json.dumps({  # 1) configure the session before any text is sent
+                "type": "session.update",
+                "session": {
+                    "mode": "server_commit",
+                    "voice": voice,
+                    "language_type": language,
+                    "response_format": "pcm",
+                    "sample_rate": 24000
+                }
+            }))
+
+            await ws.send(json.dumps({  # 2) submit the whole text in one append
+                "type": "input_text_buffer.append",
+                "text": text
+            }))
+
+            await ws.send(json.dumps({  # 3) announce that no more input follows
+                "type": "session.finish"
+            }))
+
+            async for message in ws:
+                event = json.loads(message)
+                event_type = event.get('type')  # events of any other type are ignored
+
+                if event_type == 'response.audio.delta':
+                    audio_data = base64.b64decode(event['delta'])  # each delta is base64-encoded audio
+                    audio_chunks.append(audio_data)
+                elif event_type == 'session.finished':
+                    break
+                elif event_type == 'error':
+                    raise RuntimeError(f"Aliyun API error: {event.get('error')}")
+
+        pcm_data = b''.join(audio_chunks)  # empty if the stream ended without any audio delta
+        wav_bytes = self._pcm_to_wav(pcm_data, 24000)  # same rate as requested in session.update
+        return wav_bytes, 24000
+
+    async def _create_voice_clone(self, ref_audio_bytes: bytes) -> str:
+        """POST the reference audio to the customization endpoint and return result['output']['voice']."""
+        from config import settings  # top-level config module, as imported by core/security.py
+        import httpx
+        audio_b64 = base64.b64encode(ref_audio_bytes).decode()
+        data_uri = f"data:audio/wav;base64,{audio_b64}"  # NOTE(review): labels the upload as WAV - confirm callers send WAV
+
+        payload = {
+            "model": "qwen-voice-enrollment",
+            "input": {
+                "action": "create",
+                "target_model": settings.ALIYUN_MODEL_VC,
+                "preferred_name": f"clone_{int(time.time())}",  # name is unique only per second
+                "audio": {"data": data_uri}
+            }
+        }
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(self.http_url, json=payload, headers=headers, timeout=60)
+            resp.raise_for_status()  # 4xx/5xx responses surface as httpx.HTTPStatusError
+            result = resp.json()
+            return result['output']['voice']
+
+    async def _create_voice_design(self, instruct: str, preview_text: str) -> str:
+        """Create a designed voice from a text prompt and return the voice value.
+
+        ``instruct`` is sent as ``voice_prompt``; the language is fixed to "zh".
+        An HTTP error status is raised by ``raise_for_status()``.
+        """
+        from core.config import settings
+        import httpx
+
+        design_input = {
+            "action": "create",
+            "target_model": settings.ALIYUN_MODEL_VD,
+            "voice_prompt": instruct,
+            "preview_text": preview_text,
+            # Second-resolution timestamp used as the requested voice name.
+            "preferred_name": f"design_{int(time.time())}",
+            "language": "zh",
+        }
+        output_format = {"sample_rate": 24000, "response_format": "wav"}
+        request_body = {"model": "qwen-voice-design", "input": design_input, "parameters": output_format}
+
+        auth_headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        async with httpx.AsyncClient() as http:
+            reply = await http.post(self.http_url, json=request_body, headers=auth_headers, timeout=60)
+            reply.raise_for_status()
+            return reply.json()['output']['voice']
+
+    async def health_check(self) -> dict:
+        """Probe the Aliyun endpoint; unavailable on any error or a 5xx reply."""
+        try:
+            import httpx
+            probe_url = self.http_url.replace('/customization', '/health')
+            auth_header = {"Authorization": f"Bearer {self.api_key}"}
+            async with httpx.AsyncClient() as client:
+                resp = await client.get(probe_url, headers=auth_header, timeout=5)
+                return {"available": resp.status_code < 500}
+        # Not a bare except: asyncio.CancelledError/KeyboardInterrupt must propagate.
+        except Exception:
+            return {"available": False}
+
+    @staticmethod
+    def _pcm_to_wav(pcm_data: bytes, sample_rate: int) -> bytes:
+        """Wrap raw PCM bytes in a WAV container (mono, 2-byte samples)."""
+        import io
+        import wave
+
+        sink = io.BytesIO()
+        with wave.open(sink, 'wb') as writer:
+            # Header fields must be set before any frames are written.
+            writer.setnchannels(1)
+            writer.setsampwidth(2)
+            writer.setframerate(sample_rate)
+            writer.writeframes(pcm_data)
+        return sink.getvalue()
+
+    @staticmethod
+    def _map_speaker(local_speaker: str) -> str:
+        """Translate a local speaker name into an Aliyun voice name."""
+        default_voice = "Cherry"  # also returned for any unlisted speaker
+        voice_by_speaker = {
+            # Several local speakers map to the same Aliyun voice.
+            **dict.fromkeys(("Vivian", "Female"), default_voice),
+            **dict.fromkeys(("Uncle_Fu", "Dylan", "Male"), "Longxiaochun"),
+            "Serena": "Lili",
+        }
+        return voice_by_speaker.get(local_speaker, default_voice)
+
+
+class TTSServiceFactory:
+    """Hands out cached TTS backends: one shared local backend, one Aliyun backend per API key."""
+
+    _local_backend: Optional[LocalTTSBackend] = None
+    _aliyun_backend: Optional[AliyunTTSBackend] = None
+    _user_aliyun_backends: dict[str, AliyunTTSBackend] = {}
+
+    @classmethod
+    async def get_backend(cls, backend_type: Optional[str] = None, user_api_key: Optional[str] = None) -> TTSBackend:
+        """Return the cached backend for ``backend_type``; ValueError if unknown or "aliyun" lacks a key."""
+        from core.config import settings
+
+        if backend_type is None:
+            backend_type = settings.DEFAULT_BACKEND
+
+        if backend_type == "local":
+            if cls._local_backend is None:
+                # Cache only after initialize() succeeds, so a failed init is retried.
+                backend = LocalTTSBackend()
+                await backend.initialize()
+                cls._local_backend = backend
+            return cls._local_backend
+
+        if backend_type != "aliyun":
+            raise ValueError(f"Unknown backend type: {backend_type}")
+        if not user_api_key:
+            raise ValueError("Aliyun backend requires user API key. Please set your API key first.")
+        if user_api_key not in cls._user_aliyun_backends:
+            cls._user_aliyun_backends[user_api_key] = AliyunTTSBackend(
+                api_key=user_api_key, region=settings.ALIYUN_REGION)
+        return cls._user_aliyun_backends[user_api_key]
diff --git a/qwen3-tts-backend/db/crud.py b/qwen3-tts-backend/db/crud.py
index 0a24e95..7bfc6e0 100644
--- a/qwen3-tts-backend/db/crud.py
+++ b/qwen3-tts-backend/db/crud.py
@@ -103,6 +103,21 @@ def change_user_password(
     db.refresh(user)
     return user
 
+def update_user_aliyun_key(
+    db: Session,
+    user_id: int,
+    encrypted_api_key: str
+) -> Optional[User]:
+    """Store ``encrypted_api_key`` on the user; return None if no such user."""
+    account = get_user_by_id(db, user_id)
+    if account:
+        account.aliyun_api_key = encrypted_api_key
+        account.updated_at = datetime.utcnow()
+        db.commit()
+        db.refresh(account)
+        return account
+    return None
+
 def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job:
     job = Job(
         user_id=user_id,
diff --git a/qwen3-tts-backend/db/models.py b/qwen3-tts-backend/db/models.py
index 4e7e631..42f2433 100644
--- a/qwen3-tts-backend/db/models.py
+++ b/qwen3-tts-backend/db/models.py
@@ -20,6 +20,7 @@ class User(Base):
     hashed_password = Column(String(255), nullable=False)
     is_active = Column(Boolean, default=True, nullable=False)
     is_superuser = Column(Boolean, default=False, nullable=False)
+    aliyun_api_key = Column(Text, nullable=True)
     created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
     updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
 
@@ -33,6 +34,7 @@ class Job(Base):
     user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
     job_type = Column(String(50), nullable=False)
     status = Column(String(50), default="pending", nullable=False, index=True)
+    backend_type = Column(String(20), default="local", nullable=False)
     input_data = Column(Text, nullable=True)
     input_params = Column(JSON, nullable=True)
     output_path = Column(String(500), nullable=True)
diff --git a/qwen3-tts-backend/main.py b/qwen3-tts-backend/main.py
index 6fe8c7b..7644dac 100644
--- a/qwen3-tts-backend/main.py
+++ b/qwen3-tts-backend/main.py
@@ -194,6 +194,27 @@ async def health_check():
     if queue_length > 50:
         minor_issues.append("queue_congested")
 
+    backends_status = {}
+
+    try:
+        from core.tts_service import TTSServiceFactory
+
+        try:
+            local_backend = await TTSServiceFactory.get_backend("local")
+            local_health = await local_backend.health_check()
+            backends_status["local"] = local_health
+        except Exception as e:
+            backends_status["local"] = {"available": False, "error": str(e)}
+
+        backends_status["aliyun"] = {
+            "available": True,
+            "region": settings.ALIYUN_REGION,
+            "note": "Requires user API key configuration"
+        }
+    except Exception as e:
+        logger.error(f"Backend health check failed: {e}")
+        backends_status = {"error": str(e)}
+
     if critical_issues:
         status = "unhealthy"
     elif minor_issues:
@@ -211,6 +232,7 @@ async def health_check():
         "database_connected": database_connected,
         "cache_dir_writable": cache_dir_writable,
         "output_dir_writable": output_dir_writable,
+        "backends": backends_status,
         "issues": {
             "critical": critical_issues,
             "minor": minor_issues
diff --git a/qwen3-tts-backend/requirements.txt b/qwen3-tts-backend/requirements.txt
index 15105fc..5081f5a 100644
--- a/qwen3-tts-backend/requirements.txt
+++ b/qwen3-tts-backend/requirements.txt
@@ -6,6 +6,7 @@ python-multipart==0.0.12
 python-jose[cryptography]==3.3.0
 passlib==1.7.4
 bcrypt==3.2.2
+cryptography>=41.0.0
 sqlalchemy==2.0.35
 aiosqlite==0.20.0
 soundfile==0.12.1
@@ -17,3 +18,4 @@ pytest==8.3.0
 pytest-cov==4.1.0
 pytest-asyncio==0.23.0
 httpx==0.27.0
+websockets>=12.0
diff --git a/qwen3-tts-backend/schemas/tts.py b/qwen3-tts-backend/schemas/tts.py
index 4da189a..f1e12ec 100644
--- a/qwen3-tts-backend/schemas/tts.py
+++ b/qwen3-tts-backend/schemas/tts.py
@@ -24,6 +24,7 @@ class CustomVoiceRequest(BaseModel):
     top_k: Optional[int] = Field(default=50, ge=1, le=100)
     top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
     repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
+    backend: Optional[str] = Field(default=None, description="Backend type: local or aliyun")
 
 
 class VoiceDesignRequest(BaseModel):
@@ -35,6 +36,7 @@ class VoiceDesignRequest(BaseModel):
     top_k: Optional[int] = Field(default=50, ge=1, le=100)
     top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
     repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
+    backend: Optional[str] = Field(default=None)
 
 
 class VoiceCloneRequest(BaseModel):
diff --git a/qwen3-tts-backend/schemas/user.py b/qwen3-tts-backend/schemas/user.py
index 9eb2189..cdeeb34 100644
--- a/qwen3-tts-backend/schemas/user.py
+++ b/qwen3-tts-backend/schemas/user.py
@@ -111,3 +111,10 @@ class PasswordChange(BaseModel):
         if self.new_password != self.confirm_password:
             raise ValueError('Passwords do not match')
         return self
+
+class AliyunKeyUpdate(BaseModel):  # payload with a single api_key string; presumably the raw key before encryption - confirm in api/auth.py
+    api_key: str = Field(..., min_length=1, max_length=500)  # required, 1 to 500 characters
+
+class AliyunKeyVerifyResponse(BaseModel):  # presumably the response of a key-verification endpoint - confirm in api/auth.py
+    valid: bool  # NOTE(review): looks like True means the key was accepted - confirm
+    message: str  # free-form text accompanying the valid flag
diff --git a/qwen3-tts-frontend/index.html b/qwen3-tts-frontend/index.html
index c1f1c90..1d40e2e 100644
--- a/qwen3-tts-frontend/index.html
+++ b/qwen3-tts-frontend/index.html
@@ -2,7 +2,7 @@
 <html lang="en">
   <head>
     <meta charset="UTF-8" />
-    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <link rel="icon" type="image/svg+xml" href="/qwen.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <link rel="preload" href="/fonts/noto-serif-regular.woff2" as="font" type="font/woff2" crossorigin>
     <title>Qwen3-TTS-WebUI</title>
diff --git a/qwen3-tts-frontend/public/qwen.svg b/qwen3-tts-frontend/public/qwen.svg
new file mode 100644
index 0000000..26de1c9
--- /dev/null
+++ b/qwen3-tts-frontend/public/qwen.svg
@@ -0,0 +1 @@
+Redirecting to /@lobehub/icons-static-svg@1.78.0/icons/qwen.svg
\ No newline at end of file
diff --git a/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx b/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx
index 022d90d..5b780b5 100644
--- a/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx
+++ b/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx
@@ -32,6 +32,7 @@ const formSchema = z.object({
   top_k: z.number().min(1).max(100).optional(),
   top_p: z.number().min(0).max(1).optional(),
   repetition_penalty: z.number().min(0).max(2).optional(),
+  backend: z.string().optional(),
 })
 
 type FormData = z.infer<typeof formSchema>
@@ -74,6 +75,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
       top_k: 20,
       top_p: 0.7,
       repetition_penalty: 1.05,
+      backend: 'local',
     },
   })
 
@@ -88,6 +90,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
       setValue('top_k', params.top_k || 20)
       setValue('top_p', params.top_p || 0.7)
       setValue('repetition_penalty', params.repetition_penalty || 1.05)
+      setValue('backend', params.backend || 'local')
     }
   }))
 
@@ -131,6 +134,22 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
 
   return (
     <form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
+      <div className="space-y-0.5">
+        <Label>后端选择</Label>
+        <Select
+          value={watch('backend')}
+          onValueChange={(value: string) => setValue('backend', value)}
+        >
+          <SelectTrigger>
+            <SelectValue />
+          </SelectTrigger>
+          <SelectContent>
+            <SelectItem value="local">本地模型</SelectItem>
+            <SelectItem value="aliyun">阿里云 API</SelectItem>
+          </SelectContent>
+        </Select>
+      </div>
+
       <div className="space-y-0.5">
         <IconLabel icon={Globe2} tooltip="语言" required />
         <Select
diff --git a/qwen3-tts-frontend/src/components/tts/VoiceCloneForm.tsx b/qwen3-tts-frontend/src/components/tts/VoiceCloneForm.tsx
index bb4120b..5345a0f 100644
--- a/qwen3-tts-frontend/src/components/tts/VoiceCloneForm.tsx
+++ b/qwen3-tts-frontend/src/components/tts/VoiceCloneForm.tsx
@@ -37,6 +37,7 @@ const formSchema = z.object({
   top_k: z.number().min(1).max(100).optional(),
   top_p: z.number().min(0).max(1).optional(),
   repetition_penalty: z.number().min(0).max(2).optional(),
+  backend: z.string().optional(),
 })
 
 type FormData = z.infer<typeof formSchema>
@@ -75,6 +76,7 @@ function VoiceCloneForm() {
       top_k: 20,
       top_p: 0.7,
       repetition_penalty: 1.05,
+      backend: 'local',
     } as Partial<FormData>,
   })
 
@@ -233,6 +235,22 @@ function VoiceCloneForm() {
 
       <div className={step === 2 ? 'block space-y-4' : 'hidden'}>
         {/* Step 2: Synthesis Options */}
+        <div className="space-y-0.5">
+          <Label>后端选择</Label>
+          <Select
+            value={watch('backend')}
+            onValueChange={(value: string) => setValue('backend', value)}
+          >
+            <SelectTrigger>
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              <SelectItem value="local">本地模型</SelectItem>
+              <SelectItem value="aliyun">阿里云 API</SelectItem>
+            </SelectContent>
+          </Select>
+        </div>
+
         <div className="space-y-0.5">
           <IconLabel icon={Globe2} tooltip="语言（可选）" />
           <Select
diff --git a/qwen3-tts-frontend/src/components/tts/VoiceDesignForm.tsx b/qwen3-tts-frontend/src/components/tts/VoiceDesignForm.tsx
index 9ee2e8f..a12e8d2 100644
--- a/qwen3-tts-frontend/src/components/tts/VoiceDesignForm.tsx
+++ b/qwen3-tts-frontend/src/components/tts/VoiceDesignForm.tsx
@@ -31,6 +31,7 @@ const formSchema = z.object({
   top_k: z.number().min(1).max(100).optional(),
   top_p: z.number().min(0).max(1).optional(),
   repetition_penalty: z.number().min(0).max(2).optional(),
+  backend: z.string().optional(),
 })
 
 type FormData = z.infer<typeof formSchema>
@@ -71,6 +72,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
       top_k: 20,
       top_p: 0.7,
       repetition_penalty: 1.05,
+      backend: 'local',
     },
   })
 
@@ -84,6 +86,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
       setValue('top_k', params.top_k || 20)
       setValue('top_p', params.top_p || 0.7)
       setValue('repetition_penalty', params.repetition_penalty || 1.05)
+      setValue('backend', params.backend || 'local')
     }
   }))
 
@@ -122,6 +125,22 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
 
   return (
     <form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
+      <div className="space-y-0.5">
+        <Label>后端选择</Label>
+        <Select
+          value={watch('backend')}
+          onValueChange={(value: string) => setValue('backend', value)}
+        >
+          <SelectTrigger>
+            <SelectValue />
+          </SelectTrigger>
+          <SelectContent>
+            <SelectItem value="local">本地模型</SelectItem>
+            <SelectItem value="aliyun">阿里云 API</SelectItem>
+          </SelectContent>
+        </Select>
+      </div>
+
       <div className="space-y-0.5">
         <IconLabel icon={Globe2} tooltip="语言" required />
         <Select
diff --git a/qwen3-tts-frontend/src/lib/api.ts b/qwen3-tts-frontend/src/lib/api.ts
index 0e37beb..3f7d969 100644
--- a/qwen3-tts-frontend/src/lib/api.ts
+++ b/qwen3-tts-frontend/src/lib/api.ts
@@ -247,6 +247,9 @@ export const ttsApi = {
     if (data.repetition_penalty !== undefined) {
       formData.append('repetition_penalty', String(data.repetition_penalty))
     }
+    if (data.backend) {
+      formData.append('backend', data.backend)
+    }
 
     const response = await apiClient.post<JobCreateResponse>(
       API_ENDPOINTS.TTS.VOICE_CLONE,
diff --git a/qwen3-tts-frontend/src/types/tts.ts b/qwen3-tts-frontend/src/types/tts.ts
index f58711e..cf8ae60 100644
--- a/qwen3-tts-frontend/src/types/tts.ts
+++ b/qwen3-tts-frontend/src/types/tts.ts
@@ -18,6 +18,7 @@ export interface CustomVoiceForm {
   top_k?: number
   top_p?: number
   repetition_penalty?: number
+  backend?: string
 }
 
 export interface VoiceDesignForm {
@@ -29,6 +30,7 @@ export interface VoiceDesignForm {
   top_k?: number
   top_p?: number
   repetition_penalty?: number
+  backend?: string
 }
 
 export interface VoiceCloneForm {
@@ -43,4 +45,5 @@ export interface VoiceCloneForm {
   top_k?: number
   top_p?: number
   repetition_penalty?: number
+  backend?: string
 }