From abe0dc131b045cf213198597b71b2a72b469ae0c Mon Sep 17 00:00:00 2001 From: bdim404 Date: Tue, 3 Feb 2026 15:44:25 +0800 Subject: [PATCH] feat: Implement Aliyun TTS backend integration and API key management --- qwen3-tts-backend/.env.example | 4 + qwen3-tts-backend/api/auth.py | 80 +++- qwen3-tts-backend/api/tts.py | 232 +++++------ qwen3-tts-backend/config.py | 8 + qwen3-tts-backend/core/security.py | 24 ++ qwen3-tts-backend/core/tts_service.py | 371 ++++++++++++++++++ qwen3-tts-backend/db/crud.py | 15 + qwen3-tts-backend/db/models.py | 2 + qwen3-tts-backend/main.py | 22 ++ qwen3-tts-backend/requirements.txt | 2 + qwen3-tts-backend/schemas/tts.py | 2 + qwen3-tts-backend/schemas/user.py | 7 + qwen3-tts-frontend/index.html | 2 +- qwen3-tts-frontend/public/qwen.svg | 1 + .../src/components/tts/CustomVoiceForm.tsx | 19 + .../src/components/tts/VoiceCloneForm.tsx | 18 + .../src/components/tts/VoiceDesignForm.tsx | 19 + qwen3-tts-frontend/src/lib/api.ts | 3 + qwen3-tts-frontend/src/types/tts.ts | 3 + 19 files changed, 716 insertions(+), 118 deletions(-) create mode 100644 qwen3-tts-backend/core/tts_service.py create mode 100644 qwen3-tts-frontend/public/qwen.svg diff --git a/qwen3-tts-backend/.env.example b/qwen3-tts-backend/.env.example index 7d4e8c2..b012d8b 100644 --- a/qwen3-tts-backend/.env.example +++ b/qwen3-tts-backend/.env.example @@ -20,3 +20,7 @@ BATCH_SIZE=4 BATCH_WAIT_TIME=0.5 MAX_TEXT_LENGTH=1000 MAX_AUDIO_SIZE_MB=10 + +ALIYUN_REGION=beijing + +DEFAULT_BACKEND=local diff --git a/qwen3-tts-backend/api/auth.py b/qwen3-tts-backend/api/auth.py index ec8a599..1869d77 100644 --- a/qwen3-tts-backend/api/auth.py +++ b/qwen3-tts-backend/api/auth.py @@ -14,8 +14,8 @@ from core.security import ( decode_access_token ) from db.database import get_db -from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password -from schemas.user import User, UserCreate, Token, PasswordChange +from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password, update_user_aliyun_key +from schemas.user import User, UserCreate, Token, PasswordChange, AliyunKeyUpdate, AliyunKeyVerifyResponse router = APIRouter(prefix="/auth", tags=["authentication"]) @@ -135,3 +135,79 @@ async def change_password( ) return user + +@router.post("/aliyun-key", response_model=User) +@limiter.limit("5/minute") +async def set_aliyun_key( + request: Request, + key_data: AliyunKeyUpdate, + current_user: Annotated[User, Depends(get_current_user)], + db: Session = Depends(get_db) +): + from core.security import encrypt_api_key + from core.tts_service import AliyunTTSBackend + + api_key = key_data.api_key.strip() + + aliyun_backend = AliyunTTSBackend(api_key=api_key, region=settings.ALIYUN_REGION) + health = await aliyun_backend.health_check() + + if not health.get("available", False): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Invalid Aliyun API key. Please check your API key and try again." + ) + + encrypted_key = encrypt_api_key(api_key) + + user = update_user_aliyun_key( + db, + user_id=current_user.id, + encrypted_api_key=encrypted_key + ) + + if not user: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="User not found" + ) + + return user + +@router.get("/aliyun-key/verify", response_model=AliyunKeyVerifyResponse) +@limiter.limit("10/minute") +async def verify_aliyun_key( + request: Request, + current_user: Annotated[User, Depends(get_current_user)], + db: Session = Depends(get_db) +): + from core.security import decrypt_api_key + from core.tts_service import AliyunTTSBackend + + if not current_user.aliyun_api_key: + return AliyunKeyVerifyResponse( + valid=False, + message="No Aliyun API key configured" + ) + + api_key = decrypt_api_key(current_user.aliyun_api_key) + + if not api_key: + return AliyunKeyVerifyResponse( + valid=False, + message="Failed to decrypt API key" + ) + + aliyun_backend = AliyunTTSBackend(api_key=api_key, region=settings.ALIYUN_REGION) + health = await aliyun_backend.health_check() + + if health.get("available", False): + return AliyunKeyVerifyResponse( + valid=True, + message="Aliyun API key is valid and working" + ) + else: + return AliyunKeyVerifyResponse( + valid=False, + message="Aliyun API key is not working. Please check your API key." + ) diff --git a/qwen3-tts-backend/api/tts.py b/qwen3-tts-backend/api/tts.py index 004f84b..a584c9c 100644 --- a/qwen3-tts-backend/api/tts.py +++ b/qwen3-tts-backend/api/tts.py @@ -36,9 +36,12 @@ async def process_custom_voice_job( job_id: int, user_id: int, request_data: dict, + backend_type: str, db_url: str ): from core.database import SessionLocal + from core.tts_service import TTSServiceFactory + from core.security import decrypt_api_key db = SessionLocal() try: @@ -51,42 +54,24 @@ async def process_custom_voice_job( job.started_at = datetime.utcnow() db.commit() - logger.info(f"Processing custom-voice job {job_id}") + logger.info(f"Processing custom-voice job {job_id} with backend {backend_type}") - model_manager = await ModelManager.get_instance() - await model_manager.load_model("custom-voice") - _, tts = await model_manager.get_current_model() + user_api_key = None + if backend_type == "aliyun": + user = db.query(User).filter(User.id == user_id).first() + if user and user.aliyun_api_key: + user_api_key = decrypt_api_key(user.aliyun_api_key) - if tts is None: - raise RuntimeError("Failed to load custom-voice model") + backend = await TTSServiceFactory.get_backend(backend_type, user_api_key) - result = tts.generate_custom_voice( - text=request_data['text'], - language=request_data['language'], - speaker=request_data['speaker'], - instruct=request_data.get('instruct', ''), - max_new_tokens=request_data['max_new_tokens'], - temperature=request_data['temperature'], - top_k=request_data['top_k'], - top_p=request_data['top_p'], - repetition_penalty=request_data['repetition_penalty'] - ) - - import numpy as np - if isinstance(result, tuple): - audio_data = result[0] - elif isinstance(result, list): - audio_data = np.array(result) - else: - audio_data = result - - from pathlib import Path + audio_bytes, sample_rate = await backend.generate_custom_voice(request_data) timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") filename = f"{user_id}_{job_id}_{timestamp}.wav" output_path = Path(settings.OUTPUT_DIR) / filename - save_audio_file(audio_data, 24000, output_path) + with open(output_path, 'wb') as f: + f.write(audio_bytes) job.status = JobStatus.COMPLETED job.output_path = str(output_path) @@ -112,9 +97,12 @@ async def process_voice_design_job( job_id: int, user_id: int, request_data: dict, + backend_type: str, db_url: str ): from core.database import SessionLocal + from core.tts_service import TTSServiceFactory + from core.security import decrypt_api_key db = SessionLocal() try: @@ -127,41 +115,24 @@ async def process_voice_design_job( job.started_at = datetime.utcnow() db.commit() - logger.info(f"Processing voice-design job {job_id}") + logger.info(f"Processing voice-design job {job_id} with backend {backend_type}") - model_manager = await ModelManager.get_instance() - await model_manager.load_model("voice-design") - _, tts = await model_manager.get_current_model() + user_api_key = None + if backend_type == "aliyun": + user = db.query(User).filter(User.id == user_id).first() + if user and user.aliyun_api_key: + user_api_key = decrypt_api_key(user.aliyun_api_key) - if tts is None: - raise RuntimeError("Failed to load voice-design model") + backend = await TTSServiceFactory.get_backend(backend_type, user_api_key) - result = tts.generate_voice_design( - text=request_data['text'], - language=request_data['language'], - instruct=request_data['instruct'], - max_new_tokens=request_data['max_new_tokens'], - temperature=request_data['temperature'], - top_k=request_data['top_k'], - top_p=request_data['top_p'], - repetition_penalty=request_data['repetition_penalty'] - ) - - import numpy as np - if isinstance(result, tuple): - audio_data = result[0] - elif isinstance(result, list): - audio_data = np.array(result) - else: - audio_data = result - - from pathlib import Path + audio_bytes, sample_rate = await backend.generate_voice_design(request_data) timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") filename = f"{user_id}_{job_id}_{timestamp}.wav" output_path = Path(settings.OUTPUT_DIR) / filename - save_audio_file(audio_data, 24000, output_path) + with open(output_path, 'wb') as f: + f.write(audio_bytes) job.status = JobStatus.COMPLETED job.output_path = str(output_path) @@ -188,9 +159,11 @@ async def process_voice_clone_job( user_id: int, request_data: dict, ref_audio_path: str, + backend_type: str, db_url: str ): from core.database import SessionLocal + from core.tts_service import TTSServiceFactory import numpy as np db = SessionLocal() @@ -204,7 +177,14 @@ async def process_voice_clone_job( job.started_at = datetime.utcnow() db.commit() - logger.info(f"Processing voice-clone job {job_id}") + logger.info(f"Processing voice-clone job {job_id} with backend {backend_type}") + + from core.security import decrypt_api_key + user_api_key = None + if backend_type == "aliyun": + user = db.query(User).filter(User.id == user_id).first() + if user and user.aliyun_api_key: + user_api_key = decrypt_api_key(user.aliyun_api_key) with open(ref_audio_path, 'rb') as f: ref_audio_data = f.read() @@ -212,49 +192,49 @@ async def process_voice_clone_job( cache_manager = await VoiceCacheManager.get_instance() ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data) - x_vector = None - cache_id = None - - if request_data.get('use_cache', True): - cached = await cache_manager.get_cache(user_id, ref_audio_hash, db) - if cached: - x_vector = cached['data'] - cache_id = cached['cache_id'] - cache_metrics.record_hit(user_id) - logger.info(f"Cache hit for job {job_id}, cache_id={cache_id}") - - if x_vector is None: - cache_metrics.record_miss(user_id) - logger.info(f"Cache miss for job {job_id}, creating voice clone prompt") - ref_audio_array, ref_sr = process_ref_audio(ref_audio_data) - - model_manager = await ModelManager.get_instance() - await model_manager.load_model("base") - _, tts = await model_manager.get_current_model() - - if tts is None: - raise RuntimeError("Failed to load base model") - - x_vector = tts.create_voice_clone_prompt( - ref_audio=(ref_audio_array, ref_sr), - ref_text=request_data.get('ref_text', ''), - x_vector_only_mode=request_data.get('x_vector_only_mode', False) - ) + if request_data.get('x_vector_only_mode', False) and backend_type == "local": + x_vector = None + cache_id = None if request_data.get('use_cache', True): - features = extract_audio_features(ref_audio_array, ref_sr) - metadata = { - 'duration': features['duration'], - 'sample_rate': features['sample_rate'], - 'ref_text': request_data.get('ref_text', ''), - 'x_vector_only_mode': request_data.get('x_vector_only_mode', False) - } - cache_id = await cache_manager.set_cache( - user_id, ref_audio_hash, x_vector, metadata, db - ) - logger.info(f"Created cache for job {job_id}, cache_id={cache_id}") + cached = await cache_manager.get_cache(user_id, ref_audio_hash, db) + if cached: + x_vector = cached['data'] + cache_id = cached['cache_id'] + cache_metrics.record_hit(user_id) + logger.info(f"Cache hit for job {job_id}, cache_id={cache_id}") + + if x_vector is None: + cache_metrics.record_miss(user_id) + logger.info(f"Cache miss for job {job_id}, creating voice clone prompt") + ref_audio_array, ref_sr = process_ref_audio(ref_audio_data) + + model_manager = await ModelManager.get_instance() + await model_manager.load_model("base") + _, tts = await model_manager.get_current_model() + + if tts is None: + raise RuntimeError("Failed to load base model") + + x_vector = tts.create_voice_clone_prompt( + ref_audio=(ref_audio_array, ref_sr), + ref_text=request_data.get('ref_text', ''), + x_vector_only_mode=True + ) + + if request_data.get('use_cache', True): + features = extract_audio_features(ref_audio_array, ref_sr) + metadata = { + 'duration': features['duration'], + 'sample_rate': features['sample_rate'], + 'ref_text': request_data.get('ref_text', ''), + 'x_vector_only_mode': True + } + cache_id = await cache_manager.set_cache( + user_id, ref_audio_hash, x_vector, metadata, db + ) + logger.info(f"Created cache for job {job_id}, cache_id={cache_id}") - if request_data.get('x_vector_only_mode', False): job.status = JobStatus.COMPLETED job.output_path = f"x_vector_cached_{cache_id}" job.completed_at = datetime.utcnow() @@ -262,31 +242,16 @@ async def process_voice_clone_job( logger.info(f"Job {job_id} completed (x_vector_only_mode)") return - model_manager = await ModelManager.get_instance() - await model_manager.load_model("base") - _, tts = await model_manager.get_current_model() + backend = await TTSServiceFactory.get_backend(backend_type, user_api_key) - if tts is None: - raise RuntimeError("Failed to load base model") - - wavs, sample_rate = tts.generate_voice_clone( - text=request_data['text'], - language=request_data['language'], - voice_clone_prompt=x_vector, - max_new_tokens=request_data['max_new_tokens'], - temperature=request_data['temperature'], - top_k=request_data['top_k'], - top_p=request_data['top_p'], - repetition_penalty=request_data['repetition_penalty'] - ) - - audio_data = wavs[0] if isinstance(wavs, list) else wavs + audio_bytes, sample_rate = await backend.generate_voice_clone(request_data, ref_audio_data) timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") filename = f"{user_id}_{job_id}_{timestamp}.wav" output_path = Path(settings.OUTPUT_DIR) / filename - save_audio_file(audio_data, sample_rate, output_path) + with open(output_path, 'wb') as f: + f.write(audio_bytes) job.status = JobStatus.COMPLETED job.output_path = str(output_path) @@ -319,6 +284,16 @@ async def create_custom_voice_job( current_user: User = Depends(get_current_user), db: Session = Depends(get_db) ): + from core.security import decrypt_api_key + + backend_type = req_data.backend or settings.DEFAULT_BACKEND + if backend_type == "aliyun": + if not current_user.aliyun_api_key: + raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.") + user_api_key = decrypt_api_key(current_user.aliyun_api_key) + if not user_api_key: + raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.") + try: validate_text_length(req_data.text) language = validate_language(req_data.language) @@ -339,6 +314,7 @@ async def create_custom_voice_job( user_id=current_user.id, job_type="custom-voice", status=JobStatus.PENDING, + backend_type=backend_type, input_data="", input_params={ "text": req_data.text, @@ -365,6 +341,7 @@ async def create_custom_voice_job( job.id, current_user.id, request_data, + backend_type, str(settings.DATABASE_URL) ) @@ -384,6 +361,16 @@ async def create_voice_design_job( current_user: User = Depends(get_current_user), db: Session = Depends(get_db) ): + from core.security import decrypt_api_key + + backend_type = req_data.backend or settings.DEFAULT_BACKEND + if backend_type == "aliyun": + if not current_user.aliyun_api_key: + raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.") + user_api_key = decrypt_api_key(current_user.aliyun_api_key) + if not user_api_key: + raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.") + try: validate_text_length(req_data.text) language = validate_language(req_data.language) @@ -406,6 +393,7 @@ async def create_voice_design_job( user_id=current_user.id, job_type="voice-design", status=JobStatus.PENDING, + backend_type=backend_type, input_data="", input_params={ "text": req_data.text, @@ -430,6 +418,7 @@ async def create_voice_design_job( job.id, current_user.id, request_data, + backend_type, str(settings.DATABASE_URL) ) @@ -455,10 +444,21 @@ async def create_voice_clone_job( top_k: Optional[int] = Form(default=50), top_p: Optional[float] = Form(default=1.0), repetition_penalty: Optional[float] = Form(default=1.05), + backend: Optional[str] = Form(default=None), background_tasks: BackgroundTasks = None, current_user: User = Depends(get_current_user), db: Session = Depends(get_db) ): + from core.security import decrypt_api_key + + backend_type = backend or settings.DEFAULT_BACKEND + if backend_type == "aliyun": + if not current_user.aliyun_api_key: + raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.") + user_api_key = decrypt_api_key(current_user.aliyun_api_key) + if not user_api_key: + raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.") + try: validate_text_length(text) language = validate_language(language) @@ -486,6 +486,7 @@ async def create_voice_clone_job( user_id=current_user.id, job_type="voice-clone", status=JobStatus.PENDING, + backend_type=backend_type, input_data="", input_params={ "text": text, @@ -520,6 +521,7 @@ async def create_voice_clone_job( current_user.id, request_data, tmp_audio_path, + backend_type, str(settings.DATABASE_URL) ) diff --git a/qwen3-tts-backend/config.py b/qwen3-tts-backend/config.py index 24c2572..6110973 100644 --- a/qwen3-tts-backend/config.py +++ b/qwen3-tts-backend/config.py @@ -36,6 +36,14 @@ class Settings(BaseSettings): MAX_TEXT_LENGTH: int = Field(default=1000) MAX_AUDIO_SIZE_MB: int = Field(default=10) + ALIYUN_REGION: str = Field(default="beijing") + + ALIYUN_MODEL_FLASH: str = Field(default="qwen3-tts-flash-realtime") + ALIYUN_MODEL_VC: str = Field(default="qwen3-tts-vc-realtime-2026-01-15") + ALIYUN_MODEL_VD: str = Field(default="qwen3-tts-vd-realtime-2026-01-15") + + DEFAULT_BACKEND: str = Field(default="local") + class Config: env_file = ".env" case_sensitive = True diff --git a/qwen3-tts-backend/core/security.py b/qwen3-tts-backend/core/security.py index 610018b..7c61ee5 100644 --- a/qwen3-tts-backend/core/security.py +++ b/qwen3-tts-backend/core/security.py @@ -2,6 +2,9 @@ from datetime import datetime, timedelta from typing import Optional from jose import JWTError, jwt from passlib.context import CryptContext +from cryptography.fernet import Fernet +import base64 +import hashlib from config import settings @@ -33,3 +36,24 @@ def decode_access_token(token: str) -> Optional[str]: return username except JWTError: return None + +def _get_fernet_key() -> bytes: + key = hashlib.sha256(settings.SECRET_KEY.encode()).digest() + return base64.urlsafe_b64encode(key) + +def encrypt_api_key(api_key: str) -> str: + if not api_key: + return "" + fernet = Fernet(_get_fernet_key()) + encrypted = fernet.encrypt(api_key.encode()) + return encrypted.decode() + +def decrypt_api_key(encrypted_key: str) -> Optional[str]: + if not encrypted_key: + return None + try: + fernet = Fernet(_get_fernet_key()) + decrypted = fernet.decrypt(encrypted_key.encode()) + return decrypted.decode() + except Exception: + return None diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py new file mode 100644 index 0000000..1e0d982 --- /dev/null +++ b/qwen3-tts-backend/core/tts_service.py @@ -0,0 +1,371 @@ +import time +import logging +from abc import ABC, abstractmethod +from typing import Tuple, Optional +import websockets +import json +import base64 + +logger = logging.getLogger(__name__) + + +class TTSBackend(ABC): + @abstractmethod + async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]: + pass + + @abstractmethod + async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]: + pass + + @abstractmethod + async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]: + pass + + @abstractmethod + async def health_check(self) -> dict: + pass + + +class LocalTTSBackend(TTSBackend): + def __init__(self): + self.model_manager = None + + async def initialize(self): + from core.model_manager import ModelManager + self.model_manager = await ModelManager.get_instance() + + async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]: + await self.model_manager.load_model("custom-voice") + _, tts = await self.model_manager.get_current_model() + + result = tts.generate_custom_voice( + text=params['text'], + language=params['language'], + speaker=params['speaker'], + instruct=params.get('instruct', ''), + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'] + ) + + import numpy as np + if isinstance(result, tuple): + audio_data = result[0] + elif isinstance(result, list): + audio_data = np.array(result) + else: + audio_data = result + + return self._numpy_to_bytes(audio_data), 24000 + + async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]: + await self.model_manager.load_model("voice-design") + _, tts = await self.model_manager.get_current_model() + + result = tts.generate_voice_design( + text=params['text'], + language=params['language'], + instruct=params['instruct'], + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'] + ) + + import numpy as np + audio_data = result[0] if isinstance(result, tuple) else result + return self._numpy_to_bytes(audio_data), 24000 + + async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]: + from utils.audio import process_ref_audio + + ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes) + + await self.model_manager.load_model("base") + _, tts = await self.model_manager.get_current_model() + + x_vector = tts.create_voice_clone_prompt( + ref_audio=(ref_audio_array, ref_sr), + ref_text=params.get('ref_text', ''), + x_vector_only_mode=False + ) + + wavs, sample_rate = tts.generate_voice_clone( + text=params['text'], + language=params['language'], + voice_clone_prompt=x_vector, + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'] + ) + + audio_data = wavs[0] if isinstance(wavs, list) else wavs + return self._numpy_to_bytes(audio_data), sample_rate + + async def health_check(self) -> dict: + return { + "available": self.model_manager is not None, + "current_model": self.model_manager.current_model_name if self.model_manager else None + } + + @staticmethod + def _numpy_to_bytes(audio_array) -> bytes: + import numpy as np + import io + import scipy.io.wavfile + + buffer = io.BytesIO() + scipy.io.wavfile.write(buffer, 24000, (audio_array * 32767).astype(np.int16)) + buffer.seek(0) + return buffer.read() + + +class AliyunTTSBackend(TTSBackend): + def __init__(self, api_key: str, region: str): + self.api_key = api_key + self.region = region + self.ws_url = self._get_ws_url(region) + self.http_url = self._get_http_url(region) + + def _get_ws_url(self, region: str) -> str: + if region == "beijing": + return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime" + else: + return "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime" + + def _get_http_url(self, region: str) -> str: + if region == "beijing": + return "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization" + else: + return "https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization" + + async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]: + from core.config import settings + + voice = self._map_speaker(params['speaker']) + model = settings.ALIYUN_MODEL_FLASH + + return await self._generate_via_websocket( + model=model, + text=params['text'], + voice=voice, + language=params['language'] + ) + + async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]: + from core.config import settings + + voice_id = await self._create_voice_design( + instruct=params['instruct'], + preview_text=params['text'] + ) + + model = settings.ALIYUN_MODEL_VD + + return await self._generate_via_websocket( + model=model, + text=params['text'], + voice=voice_id, + language=params['language'] + ) + + async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]: + from core.config import settings + + voice_id = await self._create_voice_clone(ref_audio_bytes) + + model = settings.ALIYUN_MODEL_VC + + return await self._generate_via_websocket( + model=model, + text=params['text'], + voice=voice_id, + language=params['language'] + ) + + async def _generate_via_websocket( + self, + model: str, + text: str, + voice: str, + language: str + ) -> Tuple[bytes, int]: + audio_chunks = [] + + url = f"{self.ws_url}?model={model}" + headers = {"Authorization": f"Bearer {self.api_key}"} + + async with websockets.connect(url, additional_headers=headers) as ws: + await ws.send(json.dumps({ + "type": "session.update", + "session": { + "mode": "server_commit", + "voice": voice, + "language_type": language, + "response_format": "pcm", + "sample_rate": 24000 + } + })) + + await ws.send(json.dumps({ + "type": "input_text_buffer.append", + "text": text + })) + + await ws.send(json.dumps({ + "type": "session.finish" + })) + + async for message in ws: + event = json.loads(message) + event_type = event.get('type') + + if event_type == 'response.audio.delta': + audio_data = base64.b64decode(event['delta']) + audio_chunks.append(audio_data) + elif event_type == 'session.finished': + break + elif event_type == 'error': + raise RuntimeError(f"Aliyun API error: {event.get('error')}") + + pcm_data = b''.join(audio_chunks) + wav_bytes = self._pcm_to_wav(pcm_data, 24000) + return wav_bytes, 24000 + + async def _create_voice_clone(self, ref_audio_bytes: bytes) -> str: + from core.config import settings + import httpx + + audio_b64 = base64.b64encode(ref_audio_bytes).decode() + data_uri = f"data:audio/wav;base64,{audio_b64}" + + payload = { + "model": "qwen-voice-enrollment", + "input": { + "action": "create", + "target_model": settings.ALIYUN_MODEL_VC, + "preferred_name": f"clone_{int(time.time())}", + "audio": {"data": data_uri} + } + } + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + async with httpx.AsyncClient() as client: + resp = await client.post(self.http_url, json=payload, headers=headers, timeout=60) + resp.raise_for_status() + result = resp.json() + return result['output']['voice'] + + async def _create_voice_design(self, instruct: str, preview_text: str) -> str: + from core.config import settings + import httpx + + payload = { + "model": "qwen-voice-design", + "input": { + "action": "create", + "target_model": settings.ALIYUN_MODEL_VD, + "voice_prompt": instruct, + "preview_text": preview_text, + "preferred_name": f"design_{int(time.time())}", + "language": "zh" + }, + "parameters": { + "sample_rate": 24000, + "response_format": "wav" + } + } + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + async with httpx.AsyncClient() as client: + resp = await client.post(self.http_url, json=payload, headers=headers, timeout=60) + resp.raise_for_status() + result = resp.json() + return result['output']['voice'] + + async def health_check(self) -> dict: + try: + import httpx + async with httpx.AsyncClient() as client: + resp = await client.get( + self.http_url.replace('/customization', '/health'), + headers={"Authorization": f"Bearer {self.api_key}"}, + timeout=5 + ) + return {"available": resp.status_code < 500} + except: + return {"available": False} + + @staticmethod + def _pcm_to_wav(pcm_data: bytes, sample_rate: int) -> bytes: + import io + import wave + + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(pcm_data) + + wav_buffer.seek(0) + return wav_buffer.read() + + @staticmethod + def _map_speaker(local_speaker: str) -> str: + mapping = { + "Vivian": "Cherry", + "Serena": "Lili", + "Uncle_Fu": "Longxiaochun", + "Dylan": "Longxiaochun", + "Female": "Cherry", + "Male": "Longxiaochun" + } + return mapping.get(local_speaker, "Cherry") + + +class TTSServiceFactory: + _local_backend: Optional[LocalTTSBackend] = None + _aliyun_backend: Optional[AliyunTTSBackend] = None + _user_aliyun_backends: dict[str, AliyunTTSBackend] = {} + + @classmethod + async def get_backend(cls, backend_type: str = None, user_api_key: Optional[str] = None) -> TTSBackend: + from core.config import settings + + if backend_type is None: + backend_type = settings.DEFAULT_BACKEND + + if backend_type == "local": + if cls._local_backend is None: + cls._local_backend = LocalTTSBackend() + await cls._local_backend.initialize() + return cls._local_backend + + elif backend_type == "aliyun": + if not user_api_key: + raise ValueError("Aliyun backend requires user API key. Please set your API key first.") + + if user_api_key not in cls._user_aliyun_backends: + cls._user_aliyun_backends[user_api_key] = AliyunTTSBackend( + api_key=user_api_key, + region=settings.ALIYUN_REGION + ) + return cls._user_aliyun_backends[user_api_key] + + else: + raise ValueError(f"Unknown backend type: {backend_type}") diff --git a/qwen3-tts-backend/db/crud.py b/qwen3-tts-backend/db/crud.py index 0a24e95..7bfc6e0 100644 --- a/qwen3-tts-backend/db/crud.py +++ b/qwen3-tts-backend/db/crud.py @@ -103,6 +103,21 @@ def change_user_password( db.refresh(user) return user +def update_user_aliyun_key( + db: Session, + user_id: int, + encrypted_api_key: str +) -> Optional[User]: + user = get_user_by_id(db, user_id) + if not user: + return None + + user.aliyun_api_key = encrypted_api_key + user.updated_at = datetime.utcnow() + db.commit() + db.refresh(user) + return user + def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job: job = Job( user_id=user_id, diff --git a/qwen3-tts-backend/db/models.py b/qwen3-tts-backend/db/models.py index 4e7e631..42f2433 100644 --- a/qwen3-tts-backend/db/models.py +++ b/qwen3-tts-backend/db/models.py @@ -20,6 +20,7 @@ class User(Base): hashed_password = Column(String(255), nullable=False) is_active = Column(Boolean, default=True, nullable=False) is_superuser = Column(Boolean, default=False, nullable=False) + aliyun_api_key = Column(Text, nullable=True) created_at = Column(DateTime, default=datetime.utcnow, nullable=False) updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) @@ -33,6 +34,7 @@ class Job(Base): user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True) job_type = Column(String(50), nullable=False) status = Column(String(50), default="pending", nullable=False, index=True) + backend_type = Column(String(20), default="local", nullable=False) input_data = Column(Text, nullable=True) input_params = Column(JSON, nullable=True) output_path = Column(String(500), nullable=True) diff --git a/qwen3-tts-backend/main.py b/qwen3-tts-backend/main.py index 6fe8c7b..7644dac 100644 --- a/qwen3-tts-backend/main.py +++ b/qwen3-tts-backend/main.py @@ -194,6 +194,27 @@ async def health_check(): if queue_length > 50: minor_issues.append("queue_congested") + backends_status = {} + + try: + from core.tts_service import TTSServiceFactory + + try: + local_backend = await TTSServiceFactory.get_backend("local") + local_health = await local_backend.health_check() + backends_status["local"] = local_health + except Exception as e: + backends_status["local"] = {"available": False, "error": str(e)} + + backends_status["aliyun"] = { + "available": True, + "region": settings.ALIYUN_REGION, + "note": "Requires user API key configuration" + } + except Exception as e: + logger.error(f"Backend health check failed: {e}") + backends_status = {"error": str(e)} + if critical_issues: status = "unhealthy" elif minor_issues: @@ -211,6 +232,7 @@ async def health_check(): "database_connected": database_connected, "cache_dir_writable": cache_dir_writable, "output_dir_writable": output_dir_writable, + "backends": backends_status, "issues": { "critical": critical_issues, "minor": minor_issues diff --git a/qwen3-tts-backend/requirements.txt b/qwen3-tts-backend/requirements.txt index 15105fc..5081f5a 100644 --- a/qwen3-tts-backend/requirements.txt +++ b/qwen3-tts-backend/requirements.txt @@ -6,6 +6,7 @@ python-multipart==0.0.12 python-jose[cryptography]==3.3.0 passlib==1.7.4 bcrypt==3.2.2 +cryptography>=41.0.0 sqlalchemy==2.0.35 aiosqlite==0.20.0 soundfile==0.12.1 @@ -17,3 +18,4 @@ pytest==8.3.0 pytest-cov==4.1.0 pytest-asyncio==0.23.0 httpx==0.27.0 +websockets>=12.0 diff --git a/qwen3-tts-backend/schemas/tts.py b/qwen3-tts-backend/schemas/tts.py index 4da189a..f1e12ec 100644 --- a/qwen3-tts-backend/schemas/tts.py +++ b/qwen3-tts-backend/schemas/tts.py @@ -24,6 +24,7 @@ class CustomVoiceRequest(BaseModel): top_k: Optional[int] = Field(default=50, ge=1, le=100) top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0) repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0) + backend: Optional[str] = Field(default=None, description="Backend type: local or aliyun") class VoiceDesignRequest(BaseModel): @@ -35,6 +36,7 @@ class VoiceDesignRequest(BaseModel): top_k: Optional[int] = Field(default=50, ge=1, le=100) top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0) repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0) + backend: Optional[str] = Field(default=None) class VoiceCloneRequest(BaseModel): diff --git a/qwen3-tts-backend/schemas/user.py b/qwen3-tts-backend/schemas/user.py index 9eb2189..cdeeb34 100644 --- a/qwen3-tts-backend/schemas/user.py +++ b/qwen3-tts-backend/schemas/user.py @@ -111,3 +111,10 @@ class PasswordChange(BaseModel): if self.new_password != self.confirm_password: raise ValueError('Passwords do not match') return self + +class AliyunKeyUpdate(BaseModel): + api_key: str = Field(..., min_length=1, max_length=500) + +class AliyunKeyVerifyResponse(BaseModel): + valid: bool + message: str diff --git a/qwen3-tts-frontend/index.html b/qwen3-tts-frontend/index.html index c1f1c90..1d40e2e 100644 --- a/qwen3-tts-frontend/index.html +++ b/qwen3-tts-frontend/index.html @@ -2,7 +2,7 @@ - + Qwen3-TTS-WebUI diff --git a/qwen3-tts-frontend/public/qwen.svg b/qwen3-tts-frontend/public/qwen.svg new file mode 100644 index 0000000..26de1c9 --- /dev/null +++ b/qwen3-tts-frontend/public/qwen.svg @@ -0,0 +1 @@ +Redirecting to /@lobehub/icons-static-svg@1.78.0/icons/qwen.svg \ No newline at end of file diff --git a/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx b/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx index 022d90d..5b780b5 100644 --- a/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx +++ b/qwen3-tts-frontend/src/components/tts/CustomVoiceForm.tsx @@ -32,6 +32,7 @@ const formSchema = z.object({ top_k: z.number().min(1).max(100).optional(), top_p: z.number().min(0).max(1).optional(), repetition_penalty: z.number().min(0).max(2).optional(), + backend: z.string().optional(), }) type FormData = z.infer @@ -74,6 +75,7 @@ const CustomVoiceForm = forwardRef((_props, ref) => { top_k: 20, top_p: 0.7, repetition_penalty: 1.05, + backend: 'local', }, }) @@ -88,6 +90,7 @@ const CustomVoiceForm = forwardRef((_props, ref) => { setValue('top_k', params.top_k || 20) setValue('top_p', params.top_p || 0.7) setValue('repetition_penalty', params.repetition_penalty || 1.05) + setValue('backend', params.backend || 'local') } })) @@ -131,6 +134,22 @@ const CustomVoiceForm = forwardRef((_props, ref) => { return (
+
+ + +
+
setValue('backend', value)} + > + + + + + 本地模型 + 阿里云 API + + +
+
setValue('backend', value)} + > + + + + + 本地模型 + 阿里云 API + + +
+