feat: Implement Aliyun TTS backend integration and API key management

This commit is contained in:
2026-02-03 15:44:25 +08:00
parent 5a5c93f075
commit abe0dc131b
19 changed files with 716 additions and 118 deletions

View File

@@ -20,3 +20,7 @@ BATCH_SIZE=4
BATCH_WAIT_TIME=0.5 BATCH_WAIT_TIME=0.5
MAX_TEXT_LENGTH=1000 MAX_TEXT_LENGTH=1000
MAX_AUDIO_SIZE_MB=10 MAX_AUDIO_SIZE_MB=10
ALIYUN_REGION=beijing
DEFAULT_BACKEND=local

View File

@@ -14,8 +14,8 @@ from core.security import (
decode_access_token decode_access_token
) )
from db.database import get_db from db.database import get_db
from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password, update_user_aliyun_key
from schemas.user import User, UserCreate, Token, PasswordChange from schemas.user import User, UserCreate, Token, PasswordChange, AliyunKeyUpdate, AliyunKeyVerifyResponse
router = APIRouter(prefix="/auth", tags=["authentication"]) router = APIRouter(prefix="/auth", tags=["authentication"])
@@ -135,3 +135,79 @@ async def change_password(
) )
return user return user
@router.post("/aliyun-key", response_model=User)
@limiter.limit("5/minute")
async def set_aliyun_key(
    request: Request,
    key_data: AliyunKeyUpdate,
    current_user: Annotated[User, Depends(get_current_user)],
    db: Session = Depends(get_db)
):
    """Validate, encrypt and store the authenticated user's Aliyun API key.

    The submitted key is stripped of surrounding whitespace, probed with
    ``AliyunTTSBackend.health_check`` and, when the probe reports the backend
    as available, encrypted and saved on the user's record.

    Returns:
        The updated user.

    Raises:
        HTTPException: 400 when the key is blank or the health check does not
            report the backend as available; 404 when the user row cannot be
            found while saving.
    """
    from core.security import encrypt_api_key
    from core.tts_service import AliyunTTSBackend

    invalid_key_detail = "Invalid Aliyun API key. Please check your API key and try again."

    api_key = key_data.api_key.strip()
    if not api_key:
        # A whitespace-only value satisfies the schema's min_length=1 but is
        # empty once stripped; encrypt_api_key("") returns "", which would
        # silently wipe the stored key while reporting success.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=invalid_key_detail
        )

    aliyun_backend = AliyunTTSBackend(api_key=api_key, region=settings.ALIYUN_REGION)
    health = await aliyun_backend.health_check()
    if not health.get("available", False):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=invalid_key_detail
        )

    # Only the encrypted form is persisted.
    encrypted_key = encrypt_api_key(api_key)
    user = update_user_aliyun_key(
        db,
        user_id=current_user.id,
        encrypted_api_key=encrypted_key
    )
    if not user:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="User not found"
        )

    return user
@router.get("/aliyun-key/verify", response_model=AliyunKeyVerifyResponse)
@limiter.limit("10/minute")
async def verify_aliyun_key(
    request: Request,
    current_user: Annotated[User, Depends(get_current_user)],
    db: Session = Depends(get_db)
):
    """Report whether the authenticated user's stored Aliyun API key works.

    The outcome is always conveyed through the response body: a missing key,
    a key that cannot be decrypted, and a failing health check each yield
    ``valid=False`` with an explanatory message.
    """
    from core.security import decrypt_api_key
    from core.tts_service import AliyunTTSBackend

    def _verdict(is_valid: bool, text: str):
        # Single place where the response model is built.
        return AliyunKeyVerifyResponse(valid=is_valid, message=text)

    if not current_user.aliyun_api_key:
        return _verdict(False, "No Aliyun API key configured")

    plain_key = decrypt_api_key(current_user.aliyun_api_key)
    if not plain_key:
        return _verdict(False, "Failed to decrypt API key")

    probe = AliyunTTSBackend(api_key=plain_key, region=settings.ALIYUN_REGION)
    report = await probe.health_check()

    if not report.get("available", False):
        return _verdict(
            False,
            "Aliyun API key is not working. Please check your API key."
        )
    return _verdict(True, "Aliyun API key is valid and working")

View File

@@ -36,9 +36,12 @@ async def process_custom_voice_job(
job_id: int, job_id: int,
user_id: int, user_id: int,
request_data: dict, request_data: dict,
backend_type: str,
db_url: str db_url: str
): ):
from core.database import SessionLocal from core.database import SessionLocal
from core.tts_service import TTSServiceFactory
from core.security import decrypt_api_key
db = SessionLocal() db = SessionLocal()
try: try:
@@ -51,42 +54,24 @@ async def process_custom_voice_job(
job.started_at = datetime.utcnow() job.started_at = datetime.utcnow()
db.commit() db.commit()
logger.info(f"Processing custom-voice job {job_id}") logger.info(f"Processing custom-voice job {job_id} with backend {backend_type}")
model_manager = await ModelManager.get_instance() user_api_key = None
await model_manager.load_model("custom-voice") if backend_type == "aliyun":
_, tts = await model_manager.get_current_model() user = db.query(User).filter(User.id == user_id).first()
if user and user.aliyun_api_key:
user_api_key = decrypt_api_key(user.aliyun_api_key)
if tts is None: backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
raise RuntimeError("Failed to load custom-voice model")
result = tts.generate_custom_voice( audio_bytes, sample_rate = await backend.generate_custom_voice(request_data)
text=request_data['text'],
language=request_data['language'],
speaker=request_data['speaker'],
instruct=request_data.get('instruct', ''),
max_new_tokens=request_data['max_new_tokens'],
temperature=request_data['temperature'],
top_k=request_data['top_k'],
top_p=request_data['top_p'],
repetition_penalty=request_data['repetition_penalty']
)
import numpy as np
if isinstance(result, tuple):
audio_data = result[0]
elif isinstance(result, list):
audio_data = np.array(result)
else:
audio_data = result
from pathlib import Path
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
filename = f"{user_id}_{job_id}_{timestamp}.wav" filename = f"{user_id}_{job_id}_{timestamp}.wav"
output_path = Path(settings.OUTPUT_DIR) / filename output_path = Path(settings.OUTPUT_DIR) / filename
save_audio_file(audio_data, 24000, output_path) with open(output_path, 'wb') as f:
f.write(audio_bytes)
job.status = JobStatus.COMPLETED job.status = JobStatus.COMPLETED
job.output_path = str(output_path) job.output_path = str(output_path)
@@ -112,9 +97,12 @@ async def process_voice_design_job(
job_id: int, job_id: int,
user_id: int, user_id: int,
request_data: dict, request_data: dict,
backend_type: str,
db_url: str db_url: str
): ):
from core.database import SessionLocal from core.database import SessionLocal
from core.tts_service import TTSServiceFactory
from core.security import decrypt_api_key
db = SessionLocal() db = SessionLocal()
try: try:
@@ -127,41 +115,24 @@ async def process_voice_design_job(
job.started_at = datetime.utcnow() job.started_at = datetime.utcnow()
db.commit() db.commit()
logger.info(f"Processing voice-design job {job_id}") logger.info(f"Processing voice-design job {job_id} with backend {backend_type}")
model_manager = await ModelManager.get_instance() user_api_key = None
await model_manager.load_model("voice-design") if backend_type == "aliyun":
_, tts = await model_manager.get_current_model() user = db.query(User).filter(User.id == user_id).first()
if user and user.aliyun_api_key:
user_api_key = decrypt_api_key(user.aliyun_api_key)
if tts is None: backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
raise RuntimeError("Failed to load voice-design model")
result = tts.generate_voice_design( audio_bytes, sample_rate = await backend.generate_voice_design(request_data)
text=request_data['text'],
language=request_data['language'],
instruct=request_data['instruct'],
max_new_tokens=request_data['max_new_tokens'],
temperature=request_data['temperature'],
top_k=request_data['top_k'],
top_p=request_data['top_p'],
repetition_penalty=request_data['repetition_penalty']
)
import numpy as np
if isinstance(result, tuple):
audio_data = result[0]
elif isinstance(result, list):
audio_data = np.array(result)
else:
audio_data = result
from pathlib import Path
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
filename = f"{user_id}_{job_id}_{timestamp}.wav" filename = f"{user_id}_{job_id}_{timestamp}.wav"
output_path = Path(settings.OUTPUT_DIR) / filename output_path = Path(settings.OUTPUT_DIR) / filename
save_audio_file(audio_data, 24000, output_path) with open(output_path, 'wb') as f:
f.write(audio_bytes)
job.status = JobStatus.COMPLETED job.status = JobStatus.COMPLETED
job.output_path = str(output_path) job.output_path = str(output_path)
@@ -188,9 +159,11 @@ async def process_voice_clone_job(
user_id: int, user_id: int,
request_data: dict, request_data: dict,
ref_audio_path: str, ref_audio_path: str,
backend_type: str,
db_url: str db_url: str
): ):
from core.database import SessionLocal from core.database import SessionLocal
from core.tts_service import TTSServiceFactory
import numpy as np import numpy as np
db = SessionLocal() db = SessionLocal()
@@ -204,7 +177,14 @@ async def process_voice_clone_job(
job.started_at = datetime.utcnow() job.started_at = datetime.utcnow()
db.commit() db.commit()
logger.info(f"Processing voice-clone job {job_id}") logger.info(f"Processing voice-clone job {job_id} with backend {backend_type}")
from core.security import decrypt_api_key
user_api_key = None
if backend_type == "aliyun":
user = db.query(User).filter(User.id == user_id).first()
if user and user.aliyun_api_key:
user_api_key = decrypt_api_key(user.aliyun_api_key)
with open(ref_audio_path, 'rb') as f: with open(ref_audio_path, 'rb') as f:
ref_audio_data = f.read() ref_audio_data = f.read()
@@ -212,6 +192,7 @@ async def process_voice_clone_job(
cache_manager = await VoiceCacheManager.get_instance() cache_manager = await VoiceCacheManager.get_instance()
ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data) ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data)
if request_data.get('x_vector_only_mode', False) and backend_type == "local":
x_vector = None x_vector = None
cache_id = None cache_id = None
@@ -238,7 +219,7 @@ async def process_voice_clone_job(
x_vector = tts.create_voice_clone_prompt( x_vector = tts.create_voice_clone_prompt(
ref_audio=(ref_audio_array, ref_sr), ref_audio=(ref_audio_array, ref_sr),
ref_text=request_data.get('ref_text', ''), ref_text=request_data.get('ref_text', ''),
x_vector_only_mode=request_data.get('x_vector_only_mode', False) x_vector_only_mode=True
) )
if request_data.get('use_cache', True): if request_data.get('use_cache', True):
@@ -247,14 +228,13 @@ async def process_voice_clone_job(
'duration': features['duration'], 'duration': features['duration'],
'sample_rate': features['sample_rate'], 'sample_rate': features['sample_rate'],
'ref_text': request_data.get('ref_text', ''), 'ref_text': request_data.get('ref_text', ''),
'x_vector_only_mode': request_data.get('x_vector_only_mode', False) 'x_vector_only_mode': True
} }
cache_id = await cache_manager.set_cache( cache_id = await cache_manager.set_cache(
user_id, ref_audio_hash, x_vector, metadata, db user_id, ref_audio_hash, x_vector, metadata, db
) )
logger.info(f"Created cache for job {job_id}, cache_id={cache_id}") logger.info(f"Created cache for job {job_id}, cache_id={cache_id}")
if request_data.get('x_vector_only_mode', False):
job.status = JobStatus.COMPLETED job.status = JobStatus.COMPLETED
job.output_path = f"x_vector_cached_{cache_id}" job.output_path = f"x_vector_cached_{cache_id}"
job.completed_at = datetime.utcnow() job.completed_at = datetime.utcnow()
@@ -262,31 +242,16 @@ async def process_voice_clone_job(
logger.info(f"Job {job_id} completed (x_vector_only_mode)") logger.info(f"Job {job_id} completed (x_vector_only_mode)")
return return
model_manager = await ModelManager.get_instance() backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
await model_manager.load_model("base")
_, tts = await model_manager.get_current_model()
if tts is None: audio_bytes, sample_rate = await backend.generate_voice_clone(request_data, ref_audio_data)
raise RuntimeError("Failed to load base model")
wavs, sample_rate = tts.generate_voice_clone(
text=request_data['text'],
language=request_data['language'],
voice_clone_prompt=x_vector,
max_new_tokens=request_data['max_new_tokens'],
temperature=request_data['temperature'],
top_k=request_data['top_k'],
top_p=request_data['top_p'],
repetition_penalty=request_data['repetition_penalty']
)
audio_data = wavs[0] if isinstance(wavs, list) else wavs
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
filename = f"{user_id}_{job_id}_{timestamp}.wav" filename = f"{user_id}_{job_id}_{timestamp}.wav"
output_path = Path(settings.OUTPUT_DIR) / filename output_path = Path(settings.OUTPUT_DIR) / filename
save_audio_file(audio_data, sample_rate, output_path) with open(output_path, 'wb') as f:
f.write(audio_bytes)
job.status = JobStatus.COMPLETED job.status = JobStatus.COMPLETED
job.output_path = str(output_path) job.output_path = str(output_path)
@@ -319,6 +284,16 @@ async def create_custom_voice_job(
current_user: User = Depends(get_current_user), current_user: User = Depends(get_current_user),
db: Session = Depends(get_db) db: Session = Depends(get_db)
): ):
from core.security import decrypt_api_key
backend_type = req_data.backend or settings.DEFAULT_BACKEND
if backend_type == "aliyun":
if not current_user.aliyun_api_key:
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
if not user_api_key:
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
try: try:
validate_text_length(req_data.text) validate_text_length(req_data.text)
language = validate_language(req_data.language) language = validate_language(req_data.language)
@@ -339,6 +314,7 @@ async def create_custom_voice_job(
user_id=current_user.id, user_id=current_user.id,
job_type="custom-voice", job_type="custom-voice",
status=JobStatus.PENDING, status=JobStatus.PENDING,
backend_type=backend_type,
input_data="", input_data="",
input_params={ input_params={
"text": req_data.text, "text": req_data.text,
@@ -365,6 +341,7 @@ async def create_custom_voice_job(
job.id, job.id,
current_user.id, current_user.id,
request_data, request_data,
backend_type,
str(settings.DATABASE_URL) str(settings.DATABASE_URL)
) )
@@ -384,6 +361,16 @@ async def create_voice_design_job(
current_user: User = Depends(get_current_user), current_user: User = Depends(get_current_user),
db: Session = Depends(get_db) db: Session = Depends(get_db)
): ):
from core.security import decrypt_api_key
backend_type = req_data.backend or settings.DEFAULT_BACKEND
if backend_type == "aliyun":
if not current_user.aliyun_api_key:
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
if not user_api_key:
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
try: try:
validate_text_length(req_data.text) validate_text_length(req_data.text)
language = validate_language(req_data.language) language = validate_language(req_data.language)
@@ -406,6 +393,7 @@ async def create_voice_design_job(
user_id=current_user.id, user_id=current_user.id,
job_type="voice-design", job_type="voice-design",
status=JobStatus.PENDING, status=JobStatus.PENDING,
backend_type=backend_type,
input_data="", input_data="",
input_params={ input_params={
"text": req_data.text, "text": req_data.text,
@@ -430,6 +418,7 @@ async def create_voice_design_job(
job.id, job.id,
current_user.id, current_user.id,
request_data, request_data,
backend_type,
str(settings.DATABASE_URL) str(settings.DATABASE_URL)
) )
@@ -455,10 +444,21 @@ async def create_voice_clone_job(
top_k: Optional[int] = Form(default=50), top_k: Optional[int] = Form(default=50),
top_p: Optional[float] = Form(default=1.0), top_p: Optional[float] = Form(default=1.0),
repetition_penalty: Optional[float] = Form(default=1.05), repetition_penalty: Optional[float] = Form(default=1.05),
backend: Optional[str] = Form(default=None),
background_tasks: BackgroundTasks = None, background_tasks: BackgroundTasks = None,
current_user: User = Depends(get_current_user), current_user: User = Depends(get_current_user),
db: Session = Depends(get_db) db: Session = Depends(get_db)
): ):
from core.security import decrypt_api_key
backend_type = backend or settings.DEFAULT_BACKEND
if backend_type == "aliyun":
if not current_user.aliyun_api_key:
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
if not user_api_key:
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
try: try:
validate_text_length(text) validate_text_length(text)
language = validate_language(language) language = validate_language(language)
@@ -486,6 +486,7 @@ async def create_voice_clone_job(
user_id=current_user.id, user_id=current_user.id,
job_type="voice-clone", job_type="voice-clone",
status=JobStatus.PENDING, status=JobStatus.PENDING,
backend_type=backend_type,
input_data="", input_data="",
input_params={ input_params={
"text": text, "text": text,
@@ -520,6 +521,7 @@ async def create_voice_clone_job(
current_user.id, current_user.id,
request_data, request_data,
tmp_audio_path, tmp_audio_path,
backend_type,
str(settings.DATABASE_URL) str(settings.DATABASE_URL)
) )

View File

@@ -36,6 +36,14 @@ class Settings(BaseSettings):
MAX_TEXT_LENGTH: int = Field(default=1000) MAX_TEXT_LENGTH: int = Field(default=1000)
MAX_AUDIO_SIZE_MB: int = Field(default=10) MAX_AUDIO_SIZE_MB: int = Field(default=10)
ALIYUN_REGION: str = Field(default="beijing")
ALIYUN_MODEL_FLASH: str = Field(default="qwen3-tts-flash-realtime")
ALIYUN_MODEL_VC: str = Field(default="qwen3-tts-vc-realtime-2026-01-15")
ALIYUN_MODEL_VD: str = Field(default="qwen3-tts-vd-realtime-2026-01-15")
DEFAULT_BACKEND: str = Field(default="local")
class Config: class Config:
env_file = ".env" env_file = ".env"
case_sensitive = True case_sensitive = True

View File

@@ -2,6 +2,9 @@ from datetime import datetime, timedelta
from typing import Optional from typing import Optional
from jose import JWTError, jwt from jose import JWTError, jwt
from passlib.context import CryptContext from passlib.context import CryptContext
from cryptography.fernet import Fernet
import base64
import hashlib
from config import settings from config import settings
@@ -33,3 +36,24 @@ def decode_access_token(token: str) -> Optional[str]:
return username return username
except JWTError: except JWTError:
return None return None
def _get_fernet_key() -> bytes:
    """Derive the Fernet key from ``settings.SECRET_KEY``.

    The secret is hashed with SHA-256 and the digest is encoded as URL-safe
    base64, the form passed to ``Fernet`` by the callers in this module.
    """
    secret_bytes = settings.SECRET_KEY.encode()
    digest = hashlib.sha256(secret_bytes).digest()
    return base64.urlsafe_b64encode(digest)
def encrypt_api_key(api_key: str) -> str:
    """Encrypt ``api_key`` with Fernet and return the token as text.

    A falsy ``api_key`` is not encrypted; an empty string is returned instead.
    """
    if api_key:
        cipher = Fernet(_get_fernet_key())
        token = cipher.encrypt(api_key.encode())
        return token.decode()
    return ""
def decrypt_api_key(encrypted_key: str) -> Optional[str]:
    """Decrypt a token produced by ``encrypt_api_key``.

    Returns:
        The plaintext key, or ``None`` when ``encrypted_key`` is falsy or
        when decryption fails for any reason.
    """
    if encrypted_key:
        try:
            cipher = Fernet(_get_fernet_key())
            plaintext = cipher.decrypt(encrypted_key.encode())
            return plaintext.decode()
        except Exception:
            # Best effort: callers treat None as "key unusable".
            return None
    return None

View File

@@ -0,0 +1,371 @@
import time
import logging
from abc import ABC, abstractmethod
from typing import Tuple, Optional
import websockets
import json
import base64
logger = logging.getLogger(__name__)
class TTSBackend(ABC):
    """Abstract interface shared by all speech-synthesis backends.

    Each ``generate_*`` coroutine is annotated to return a
    ``(audio_bytes, sample_rate)`` pair.
    """

    @abstractmethod
    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
        """Synthesize speech for a custom-voice request described by ``params``."""

    @abstractmethod
    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
        """Synthesize speech for a voice-design request described by ``params``."""

    @abstractmethod
    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
        """Synthesize speech for a voice-clone request using ``ref_audio_bytes`` as reference audio."""

    @abstractmethod
    async def health_check(self) -> dict:
        """Return a status dictionary for this backend."""
class LocalTTSBackend(TTSBackend):
    """Backend that synthesizes speech with models served by ``ModelManager``.

    ``initialize`` binds the shared model manager; the ``generate_*``
    coroutines load the model they need, run it, and return the audio
    serialized as a 16-bit WAV together with its sample rate.
    """

    def __init__(self):
        # Bound by initialize(); stays None until then.
        self.model_manager = None

    async def initialize(self):
        """Bind the shared ``ModelManager`` instance to this backend."""
        from core.model_manager import ModelManager
        self.model_manager = await ModelManager.get_instance()

    async def _load_model(self, model_name: str):
        """Load ``model_name`` and return the manager's current TTS object.

        Raises:
            RuntimeError: if the manager reports no model after loading.
        """
        if self.model_manager is None:
            # Tolerate use before initialize() instead of failing on None.
            await self.initialize()
        await self.model_manager.load_model(model_name)
        _, tts = await self.model_manager.get_current_model()
        if tts is None:
            raise RuntimeError(f"Failed to load {model_name} model")
        return tts

    @staticmethod
    def _extract_audio(result):
        """Pull the audio out of a generation result.

        A tuple yields its first element, a list is converted to an array,
        anything else is returned unchanged.
        """
        import numpy as np
        if isinstance(result, tuple):
            return result[0]
        if isinstance(result, list):
            return np.array(result)
        return result

    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
        """Generate speech with the "custom-voice" model.

        Reads ``text``, ``language``, ``speaker``, ``max_new_tokens``,
        ``temperature``, ``top_k``, ``top_p`` and ``repetition_penalty`` from
        ``params``; ``instruct`` is optional and defaults to an empty string.

        Raises:
            KeyError: if a required entry is missing from ``params``.
            RuntimeError: if the model could not be loaded.
        """
        tts = await self._load_model("custom-voice")

        result = tts.generate_custom_voice(
            text=params['text'],
            language=params['language'],
            speaker=params['speaker'],
            instruct=params.get('instruct', ''),
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        # NOTE(review): 24000 Hz is assumed for this model's output - confirm.
        return self._numpy_to_bytes(self._extract_audio(result)), 24000

    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
        """Generate speech with the "voice-design" model.

        Reads ``text``, ``language``, ``instruct``, ``max_new_tokens``,
        ``temperature``, ``top_k``, ``top_p`` and ``repetition_penalty`` from
        ``params``.

        Raises:
            KeyError: if a required entry is missing from ``params``.
            RuntimeError: if the model could not be loaded.
        """
        tts = await self._load_model("voice-design")

        result = tts.generate_voice_design(
            text=params['text'],
            language=params['language'],
            instruct=params['instruct'],
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        # Same result normalization as generate_custom_voice (tuple/list/array).
        # NOTE(review): 24000 Hz is assumed for this model's output - confirm.
        return self._numpy_to_bytes(self._extract_audio(result)), 24000

    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
        """Generate speech with the "base" model, cloning the reference audio.

        The reference audio is decoded with ``process_ref_audio``, turned into
        a voice-clone prompt (``ref_text`` is optional), and used to synthesize
        ``params['text']``. The returned sample rate is the one reported by the
        model, and the WAV header is written with that same rate.

        Raises:
            KeyError: if a required entry is missing from ``params``.
            RuntimeError: if the model could not be loaded.
        """
        from utils.audio import process_ref_audio

        ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes)
        tts = await self._load_model("base")

        x_vector = tts.create_voice_clone_prompt(
            ref_audio=(ref_audio_array, ref_sr),
            ref_text=params.get('ref_text', ''),
            x_vector_only_mode=False
        )

        wavs, sample_rate = tts.generate_voice_clone(
            text=params['text'],
            language=params['language'],
            voice_clone_prompt=x_vector,
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        audio_data = wavs[0] if isinstance(wavs, list) else wavs
        return self._numpy_to_bytes(audio_data, sample_rate), sample_rate

    async def health_check(self) -> dict:
        """Report whether a model manager is bound and which model is current."""
        return {
            "available": self.model_manager is not None,
            "current_model": self.model_manager.current_model_name if self.model_manager else None
        }

    @staticmethod
    def _numpy_to_bytes(audio_array, sample_rate: int = 24000) -> bytes:
        """Serialize audio samples as a 16-bit PCM WAV file held in memory.

        Args:
            audio_array: Audio samples; values are scaled by 32767, so the
                expected range is [-1.0, 1.0].
            sample_rate: Rate written to the WAV header (defaults to 24000).

        Returns:
            The complete WAV file contents.
        """
        import io
        import numpy as np
        import scipy.io.wavfile

        # Clip first: values outside [-1, 1] would otherwise wrap around when
        # cast to int16 and produce loud artifacts.
        samples = np.clip(np.asarray(audio_array), -1.0, 1.0)
        pcm = (samples * 32767).astype(np.int16)

        buffer = io.BytesIO()
        scipy.io.wavfile.write(buffer, int(sample_rate), pcm)
        return buffer.getvalue()
class AliyunTTSBackend(TTSBackend):
    """Backend that synthesizes speech through Aliyun DashScope with a user's API key.

    Audio is streamed from the realtime WebSocket endpoint. Voice cloning and
    voice design first register a voice through the HTTP customization
    endpoint and then synthesize with the voice id it returns.
    """

    def __init__(self, api_key: str, region: str):
        self.api_key = api_key
        self.region = region
        self.ws_url = self._get_ws_url(region)
        self.http_url = self._get_http_url(region)

    def _get_ws_url(self, region: str) -> str:
        """Return the realtime WebSocket URL; any region other than "beijing" uses the intl host."""
        if region == "beijing":
            return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
        else:
            return "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"

    def _get_http_url(self, region: str) -> str:
        """Return the voice customization URL; any region other than "beijing" uses the intl host."""
        if region == "beijing":
            return "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
        else:
            return "https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization"

    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
        """Synthesize ``params['text']`` with the flash model and a mapped preset voice.

        ``params['speaker']`` is translated with ``_map_speaker``; only
        ``text``, ``speaker`` and ``language`` are read from ``params``.
        """
        from core.config import settings

        voice = self._map_speaker(params['speaker'])
        model = settings.ALIYUN_MODEL_FLASH

        return await self._generate_via_websocket(
            model=model,
            text=params['text'],
            voice=voice,
            language=params['language']
        )

    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
        """Create a designed voice from ``params['instruct']`` and synthesize ``params['text']`` with it."""
        from core.config import settings

        voice_id = await self._create_voice_design(
            instruct=params['instruct'],
            preview_text=params['text']
        )

        model = settings.ALIYUN_MODEL_VD

        return await self._generate_via_websocket(
            model=model,
            text=params['text'],
            voice=voice_id,
            language=params['language']
        )

    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
        """Enroll ``ref_audio_bytes`` as a cloned voice and synthesize ``params['text']`` with it."""
        from core.config import settings

        voice_id = await self._create_voice_clone(ref_audio_bytes)

        model = settings.ALIYUN_MODEL_VC

        return await self._generate_via_websocket(
            model=model,
            text=params['text'],
            voice=voice_id,
            language=params['language']
        )

    async def _generate_via_websocket(
        self,
        model: str,
        text: str,
        voice: str,
        language: str
    ) -> Tuple[bytes, int]:
        """Stream synthesized audio for ``text`` and return it as ``(wav_bytes, 24000)``.

        Sends a session update, the text, and a session-finish message, then
        collects base64 ``response.audio.delta`` payloads until
        ``session.finished`` arrives or the connection ends.

        Raises:
            RuntimeError: if the server sends an ``error`` event, or if no
                audio was received at all.
        """
        sample_rate = 24000
        audio_chunks = []

        url = f"{self.ws_url}?model={model}"
        headers = {"Authorization": f"Bearer {self.api_key}"}

        # NOTE(review): `additional_headers` is the keyword of the newer
        # asyncio client in the websockets package (the legacy client took
        # `extra_headers`); confirm the pinned websockets version provides it.
        async with websockets.connect(url, additional_headers=headers) as ws:
            await ws.send(json.dumps({
                "type": "session.update",
                "session": {
                    "mode": "server_commit",
                    "voice": voice,
                    "language_type": language,
                    "response_format": "pcm",
                    "sample_rate": sample_rate
                }
            }))

            await ws.send(json.dumps({
                "type": "input_text_buffer.append",
                "text": text
            }))

            await ws.send(json.dumps({
                "type": "session.finish"
            }))

            async for message in ws:
                event = json.loads(message)
                event_type = event.get('type')

                if event_type == 'response.audio.delta':
                    audio_chunks.append(base64.b64decode(event['delta']))
                elif event_type == 'session.finished':
                    break
                elif event_type == 'error':
                    raise RuntimeError(f"Aliyun API error: {event.get('error')}")

        if not audio_chunks:
            # Without this an empty WAV would be handed back as a success.
            raise RuntimeError("Aliyun API returned no audio data")

        pcm_data = b''.join(audio_chunks)
        wav_bytes = self._pcm_to_wav(pcm_data, sample_rate)
        return wav_bytes, sample_rate

    async def _create_voice_clone(self, ref_audio_bytes: bytes) -> str:
        """Register ``ref_audio_bytes`` as a cloned voice and return its voice id.

        The audio is sent inline as a base64 ``data:audio/wav`` URI.

        Raises:
            httpx.HTTPStatusError: if the service answers with an error status.
            KeyError: if the response lacks ``output.voice``.
        """
        from core.config import settings
        import httpx

        audio_b64 = base64.b64encode(ref_audio_bytes).decode()
        data_uri = f"data:audio/wav;base64,{audio_b64}"

        payload = {
            "model": "qwen-voice-enrollment",
            "input": {
                "action": "create",
                "target_model": settings.ALIYUN_MODEL_VC,
                "preferred_name": f"clone_{int(time.time())}",
                "audio": {"data": data_uri}
            }
        }

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        async with httpx.AsyncClient() as client:
            resp = await client.post(self.http_url, json=payload, headers=headers, timeout=60)
            resp.raise_for_status()
            result = resp.json()
            return result['output']['voice']

    async def _create_voice_design(self, instruct: str, preview_text: str) -> str:
        """Register a voice designed from the ``instruct`` prompt and return its voice id.

        Raises:
            httpx.HTTPStatusError: if the service answers with an error status.
            KeyError: if the response lacks ``output.voice``.
        """
        from core.config import settings
        import httpx

        payload = {
            "model": "qwen-voice-design",
            "input": {
                "action": "create",
                "target_model": settings.ALIYUN_MODEL_VD,
                "voice_prompt": instruct,
                "preview_text": preview_text,
                "preferred_name": f"design_{int(time.time())}",
                # NOTE(review): language is fixed to "zh" regardless of the
                # request's language - confirm this is intended.
                "language": "zh"
            },
            "parameters": {
                "sample_rate": 24000,
                "response_format": "wav"
            }
        }

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        async with httpx.AsyncClient() as client:
            resp = await client.post(self.http_url, json=payload, headers=headers, timeout=60)
            resp.raise_for_status()
            result = resp.json()
            return result['output']['voice']

    async def health_check(self) -> dict:
        """Probe the service with this backend's API key.

        Issues a GET against the customization URL with ``/customization``
        replaced by ``/health``. Returns ``{"available": False}`` when the
        request fails, when the service rejects the credentials (401/403), or
        when it answers with a 5xx status; ``{"available": True}`` otherwise.
        """
        try:
            import httpx
            async with httpx.AsyncClient() as client:
                resp = await client.get(
                    self.http_url.replace('/customization', '/health'),
                    headers={"Authorization": f"Bearer {self.api_key}"},
                    timeout=5
                )
        except Exception as exc:
            # `except Exception` (not a bare except) so task cancellation and
            # interpreter shutdown still propagate.
            logger.warning("Aliyun health check failed: %s", exc)
            return {"available": False}

        if resp.status_code in (401, 403):
            # Callers use this probe to validate the key, so an explicit
            # authentication/authorization failure must not count as usable.
            return {"available": False}
        return {"available": resp.status_code < 500}

    @staticmethod
    def _pcm_to_wav(pcm_data: bytes, sample_rate: int) -> bytes:
        """Wrap raw PCM in a WAV container written as mono, 2 bytes per sample."""
        import io
        import wave

        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)

        wav_buffer.seek(0)
        return wav_buffer.read()

    @staticmethod
    def _map_speaker(local_speaker: str) -> str:
        """Translate a local speaker name to an Aliyun voice; unknown names map to "Cherry"."""
        mapping = {
            "Vivian": "Cherry",
            "Serena": "Lili",
            "Uncle_Fu": "Longxiaochun",
            "Dylan": "Longxiaochun",
            "Female": "Cherry",
            "Male": "Longxiaochun"
        }
        return mapping.get(local_speaker, "Cherry")
class TTSServiceFactory:
    """Hands out TTS backend instances by backend type.

    The local backend is a lazily created singleton. Aliyun backends are
    cached per user API key, with the cache capped at ``_MAX_USER_BACKENDS``
    entries.
    """

    _local_backend: Optional[LocalTTSBackend] = None
    # Not used by get_backend; retained so existing references keep working.
    _aliyun_backend: Optional[AliyunTTSBackend] = None
    _user_aliyun_backends: dict[str, AliyunTTSBackend] = {}
    # Upper bound on cached per-key backends so the cache cannot grow forever.
    _MAX_USER_BACKENDS = 256

    @classmethod
    async def get_backend(cls, backend_type: Optional[str] = None, user_api_key: Optional[str] = None) -> TTSBackend:
        """Return the backend for ``backend_type``.

        Args:
            backend_type: "local" or "aliyun"; ``None`` selects
                ``settings.DEFAULT_BACKEND``.
            user_api_key: Plaintext Aliyun API key; required for "aliyun".

        Raises:
            ValueError: if "aliyun" is requested without a key, or the
                backend type is unknown.
        """
        from core.config import settings

        if backend_type is None:
            backend_type = settings.DEFAULT_BACKEND

        if backend_type == "local":
            if cls._local_backend is None:
                # Publish the singleton only after initialize() succeeded, so
                # a failed or still-running initialization never leaves a
                # backend without a model manager visible to other callers.
                backend = LocalTTSBackend()
                await backend.initialize()
                if cls._local_backend is None:
                    cls._local_backend = backend
            return cls._local_backend

        if backend_type == "aliyun":
            if not user_api_key:
                raise ValueError("Aliyun backend requires user API key. Please set your API key first.")

            cache = cls._user_aliyun_backends
            backend = cache.get(user_api_key)
            if backend is None:
                # Evict the oldest entries first (dicts keep insertion order).
                while len(cache) >= cls._MAX_USER_BACKENDS:
                    del cache[next(iter(cache))]
                backend = AliyunTTSBackend(
                    api_key=user_api_key,
                    region=settings.ALIYUN_REGION
                )
                cache[user_api_key] = backend
            return backend

        raise ValueError(f"Unknown backend type: {backend_type}")

View File

@@ -103,6 +103,21 @@ def change_user_password(
db.refresh(user) db.refresh(user)
return user return user
def update_user_aliyun_key(
    db: Session,
    user_id: int,
    encrypted_api_key: str
) -> Optional[User]:
    """Store ``encrypted_api_key`` on the user identified by ``user_id``.

    Also stamps ``updated_at`` with the current UTC time, commits, and
    refreshes the row.

    Returns:
        The refreshed user, or ``None`` when no such user exists.
    """
    record = get_user_by_id(db, user_id)
    if record:
        record.aliyun_api_key = encrypted_api_key
        record.updated_at = datetime.utcnow()
        db.commit()
        db.refresh(record)
        return record
    return None
def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job: def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job:
job = Job( job = Job(
user_id=user_id, user_id=user_id,

View File

@@ -20,6 +20,7 @@ class User(Base):
hashed_password = Column(String(255), nullable=False) hashed_password = Column(String(255), nullable=False)
is_active = Column(Boolean, default=True, nullable=False) is_active = Column(Boolean, default=True, nullable=False)
is_superuser = Column(Boolean, default=False, nullable=False) is_superuser = Column(Boolean, default=False, nullable=False)
aliyun_api_key = Column(Text, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False) created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
@@ -33,6 +34,7 @@ class Job(Base):
user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True) user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
job_type = Column(String(50), nullable=False) job_type = Column(String(50), nullable=False)
status = Column(String(50), default="pending", nullable=False, index=True) status = Column(String(50), default="pending", nullable=False, index=True)
backend_type = Column(String(20), default="local", nullable=False)
input_data = Column(Text, nullable=True) input_data = Column(Text, nullable=True)
input_params = Column(JSON, nullable=True) input_params = Column(JSON, nullable=True)
output_path = Column(String(500), nullable=True) output_path = Column(String(500), nullable=True)

View File

@@ -194,6 +194,27 @@ async def health_check():
if queue_length > 50: if queue_length > 50:
minor_issues.append("queue_congested") minor_issues.append("queue_congested")
backends_status = {}
try:
from core.tts_service import TTSServiceFactory
try:
local_backend = await TTSServiceFactory.get_backend("local")
local_health = await local_backend.health_check()
backends_status["local"] = local_health
except Exception as e:
backends_status["local"] = {"available": False, "error": str(e)}
backends_status["aliyun"] = {
"available": True,
"region": settings.ALIYUN_REGION,
"note": "Requires user API key configuration"
}
except Exception as e:
logger.error(f"Backend health check failed: {e}")
backends_status = {"error": str(e)}
if critical_issues: if critical_issues:
status = "unhealthy" status = "unhealthy"
elif minor_issues: elif minor_issues:
@@ -211,6 +232,7 @@ async def health_check():
"database_connected": database_connected, "database_connected": database_connected,
"cache_dir_writable": cache_dir_writable, "cache_dir_writable": cache_dir_writable,
"output_dir_writable": output_dir_writable, "output_dir_writable": output_dir_writable,
"backends": backends_status,
"issues": { "issues": {
"critical": critical_issues, "critical": critical_issues,
"minor": minor_issues "minor": minor_issues

View File

@@ -6,6 +6,7 @@ python-multipart==0.0.12
python-jose[cryptography]==3.3.0 python-jose[cryptography]==3.3.0
passlib==1.7.4 passlib==1.7.4
bcrypt==3.2.2 bcrypt==3.2.2
cryptography>=41.0.0
sqlalchemy==2.0.35 sqlalchemy==2.0.35
aiosqlite==0.20.0 aiosqlite==0.20.0
soundfile==0.12.1 soundfile==0.12.1
@@ -17,3 +18,4 @@ pytest==8.3.0
pytest-cov==4.1.0 pytest-cov==4.1.0
pytest-asyncio==0.23.0 pytest-asyncio==0.23.0
httpx==0.27.0 httpx==0.27.0
websockets>=12.0

View File

@@ -24,6 +24,7 @@ class CustomVoiceRequest(BaseModel):
top_k: Optional[int] = Field(default=50, ge=1, le=100) top_k: Optional[int] = Field(default=50, ge=1, le=100)
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0) top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0) repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
backend: Optional[str] = Field(default=None, description="Backend type: local or aliyun")
class VoiceDesignRequest(BaseModel): class VoiceDesignRequest(BaseModel):
@@ -35,6 +36,7 @@ class VoiceDesignRequest(BaseModel):
top_k: Optional[int] = Field(default=50, ge=1, le=100) top_k: Optional[int] = Field(default=50, ge=1, le=100)
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0) top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0) repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
backend: Optional[str] = Field(default=None)
class VoiceCloneRequest(BaseModel): class VoiceCloneRequest(BaseModel):

View File

@@ -111,3 +111,10 @@ class PasswordChange(BaseModel):
if self.new_password != self.confirm_password: if self.new_password != self.confirm_password:
raise ValueError('Passwords do not match') raise ValueError('Passwords do not match')
return self return self
# Request schema carrying a user-supplied Aliyun API key.
# In this commit it is the body type of the POST /auth/aliyun-key handler,
# which strips the key, health-checks it against the Aliyun backend and
# stores it encrypted.
class AliyunKeyUpdate(BaseModel):
# Required string, 1-500 characters. The bounds are declared on the raw
# value; the handler calls .strip() afterwards, so a whitespace-only key
# presumably passes this check and is rejected only by the health check
# (NOTE(review): confirm no strip-whitespace model config applies).
api_key: str = Field(..., min_length=1, max_length=500)
# Response schema reporting the outcome of an Aliyun API key verification.
# NOTE(review): it is imported by the auth router in this commit, but the
# endpoint that returns it is not visible here - confirm where it is used.
class AliyunKeyVerifyResponse(BaseModel):
# Boolean verdict; presumably True when the key was accepted.
valid: bool
# Free-form text accompanying the verdict.
message: str

View File

@@ -2,7 +2,7 @@
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" /> <link rel="icon" type="image/svg+xml" href="/qwen.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="preload" href="/fonts/noto-serif-regular.woff2" as="font" type="font/woff2" crossorigin> <link rel="preload" href="/fonts/noto-serif-regular.woff2" as="font" type="font/woff2" crossorigin>
<title>Qwen3-TTS-WebUI</title> <title>Qwen3-TTS-WebUI</title>

View File

@@ -0,0 +1 @@
Redirecting to /@lobehub/icons-static-svg@1.78.0/icons/qwen.svg

View File

@@ -32,6 +32,7 @@ const formSchema = z.object({
top_k: z.number().min(1).max(100).optional(), top_k: z.number().min(1).max(100).optional(),
top_p: z.number().min(0).max(1).optional(), top_p: z.number().min(0).max(1).optional(),
repetition_penalty: z.number().min(0).max(2).optional(), repetition_penalty: z.number().min(0).max(2).optional(),
backend: z.string().optional(),
}) })
type FormData = z.infer<typeof formSchema> type FormData = z.infer<typeof formSchema>
@@ -74,6 +75,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
top_k: 20, top_k: 20,
top_p: 0.7, top_p: 0.7,
repetition_penalty: 1.05, repetition_penalty: 1.05,
backend: 'local',
}, },
}) })
@@ -88,6 +90,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
setValue('top_k', params.top_k || 20) setValue('top_k', params.top_k || 20)
setValue('top_p', params.top_p || 0.7) setValue('top_p', params.top_p || 0.7)
setValue('repetition_penalty', params.repetition_penalty || 1.05) setValue('repetition_penalty', params.repetition_penalty || 1.05)
setValue('backend', params.backend || 'local')
} }
})) }))
@@ -131,6 +134,22 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
return ( return (
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2"> <form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
<div className="space-y-0.5">
<Label></Label>
<Select
value={watch('backend')}
onValueChange={(value: string) => setValue('backend', value)}
>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="local"></SelectItem>
<SelectItem value="aliyun"> API</SelectItem>
</SelectContent>
</Select>
</div>
<div className="space-y-0.5"> <div className="space-y-0.5">
<IconLabel icon={Globe2} tooltip="语言" required /> <IconLabel icon={Globe2} tooltip="语言" required />
<Select <Select

View File

@@ -37,6 +37,7 @@ const formSchema = z.object({
top_k: z.number().min(1).max(100).optional(), top_k: z.number().min(1).max(100).optional(),
top_p: z.number().min(0).max(1).optional(), top_p: z.number().min(0).max(1).optional(),
repetition_penalty: z.number().min(0).max(2).optional(), repetition_penalty: z.number().min(0).max(2).optional(),
backend: z.string().optional(),
}) })
type FormData = z.infer<typeof formSchema> type FormData = z.infer<typeof formSchema>
@@ -75,6 +76,7 @@ function VoiceCloneForm() {
top_k: 20, top_k: 20,
top_p: 0.7, top_p: 0.7,
repetition_penalty: 1.05, repetition_penalty: 1.05,
backend: 'local',
} as Partial<FormData>, } as Partial<FormData>,
}) })
@@ -233,6 +235,22 @@ function VoiceCloneForm() {
<div className={step === 2 ? 'block space-y-4' : 'hidden'}> <div className={step === 2 ? 'block space-y-4' : 'hidden'}>
{/* Step 2: Synthesis Options */} {/* Step 2: Synthesis Options */}
<div className="space-y-0.5">
<Label></Label>
<Select
value={watch('backend')}
onValueChange={(value: string) => setValue('backend', value)}
>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="local"></SelectItem>
<SelectItem value="aliyun"> API</SelectItem>
</SelectContent>
</Select>
</div>
<div className="space-y-0.5"> <div className="space-y-0.5">
<IconLabel icon={Globe2} tooltip="语言(可选)" /> <IconLabel icon={Globe2} tooltip="语言(可选)" />
<Select <Select

View File

@@ -31,6 +31,7 @@ const formSchema = z.object({
top_k: z.number().min(1).max(100).optional(), top_k: z.number().min(1).max(100).optional(),
top_p: z.number().min(0).max(1).optional(), top_p: z.number().min(0).max(1).optional(),
repetition_penalty: z.number().min(0).max(2).optional(), repetition_penalty: z.number().min(0).max(2).optional(),
backend: z.string().optional(),
}) })
type FormData = z.infer<typeof formSchema> type FormData = z.infer<typeof formSchema>
@@ -71,6 +72,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
top_k: 20, top_k: 20,
top_p: 0.7, top_p: 0.7,
repetition_penalty: 1.05, repetition_penalty: 1.05,
backend: 'local',
}, },
}) })
@@ -84,6 +86,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
setValue('top_k', params.top_k || 20) setValue('top_k', params.top_k || 20)
setValue('top_p', params.top_p || 0.7) setValue('top_p', params.top_p || 0.7)
setValue('repetition_penalty', params.repetition_penalty || 1.05) setValue('repetition_penalty', params.repetition_penalty || 1.05)
setValue('backend', params.backend || 'local')
} }
})) }))
@@ -122,6 +125,22 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
return ( return (
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2"> <form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
<div className="space-y-0.5">
<Label></Label>
<Select
value={watch('backend')}
onValueChange={(value: string) => setValue('backend', value)}
>
<SelectTrigger>
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="local"></SelectItem>
<SelectItem value="aliyun"> API</SelectItem>
</SelectContent>
</Select>
</div>
<div className="space-y-0.5"> <div className="space-y-0.5">
<IconLabel icon={Globe2} tooltip="语言" required /> <IconLabel icon={Globe2} tooltip="语言" required />
<Select <Select

View File

@@ -247,6 +247,9 @@ export const ttsApi = {
if (data.repetition_penalty !== undefined) { if (data.repetition_penalty !== undefined) {
formData.append('repetition_penalty', String(data.repetition_penalty)) formData.append('repetition_penalty', String(data.repetition_penalty))
} }
if (data.backend) {
formData.append('backend', data.backend)
}
const response = await apiClient.post<JobCreateResponse>( const response = await apiClient.post<JobCreateResponse>(
API_ENDPOINTS.TTS.VOICE_CLONE, API_ENDPOINTS.TTS.VOICE_CLONE,

View File

@@ -18,6 +18,7 @@ export interface CustomVoiceForm {
top_k?: number top_k?: number
top_p?: number top_p?: number
repetition_penalty?: number repetition_penalty?: number
backend?: string
} }
export interface VoiceDesignForm { export interface VoiceDesignForm {
@@ -29,6 +30,7 @@ export interface VoiceDesignForm {
top_k?: number top_k?: number
top_p?: number top_p?: number
repetition_penalty?: number repetition_penalty?: number
backend?: string
} }
export interface VoiceCloneForm { export interface VoiceCloneForm {
@@ -43,4 +45,5 @@ export interface VoiceCloneForm {
top_k?: number top_k?: number
top_p?: number top_p?: number
repetition_penalty?: number repetition_penalty?: number
backend?: string
} }