feat: Implement Aliyun TTS backend integration and API key management
This commit is contained in:
@@ -20,3 +20,7 @@ BATCH_SIZE=4
|
|||||||
BATCH_WAIT_TIME=0.5
|
BATCH_WAIT_TIME=0.5
|
||||||
MAX_TEXT_LENGTH=1000
|
MAX_TEXT_LENGTH=1000
|
||||||
MAX_AUDIO_SIZE_MB=10
|
MAX_AUDIO_SIZE_MB=10
|
||||||
|
|
||||||
|
ALIYUN_REGION=beijing
|
||||||
|
|
||||||
|
DEFAULT_BACKEND=local
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ from core.security import (
|
|||||||
decode_access_token
|
decode_access_token
|
||||||
)
|
)
|
||||||
from db.database import get_db
|
from db.database import get_db
|
||||||
from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password
|
from db.crud import get_user_by_username, get_user_by_email, create_user, change_user_password, update_user_aliyun_key
|
||||||
from schemas.user import User, UserCreate, Token, PasswordChange
|
from schemas.user import User, UserCreate, Token, PasswordChange, AliyunKeyUpdate, AliyunKeyVerifyResponse
|
||||||
|
|
||||||
router = APIRouter(prefix="/auth", tags=["authentication"])
|
router = APIRouter(prefix="/auth", tags=["authentication"])
|
||||||
|
|
||||||
@@ -135,3 +135,79 @@ async def change_password(
|
|||||||
)
|
)
|
||||||
|
|
||||||
return user
|
return user
|
||||||
|
|
||||||
|
@router.post("/aliyun-key", response_model=User)
@limiter.limit("5/minute")
async def set_aliyun_key(
    request: Request,
    key_data: AliyunKeyUpdate,
    current_user: Annotated[User, Depends(get_current_user)],
    db: Session = Depends(get_db)
):
    """Validate and store an Aliyun API key for the authenticated user.

    The submitted key is stripped of surrounding whitespace, checked by
    running ``AliyunTTSBackend.health_check()`` with it, encrypted with
    ``encrypt_api_key`` and then persisted through ``update_user_aliyun_key``.

    Raises:
        HTTPException: 400 when the health check does not report the backend
            as available; 404 when the update returns no user.
    """
    # Imported at call time, as in the rest of this module's Aliyun handlers.
    from core.security import encrypt_api_key
    from core.tts_service import AliyunTTSBackend

    candidate_key = key_data.api_key.strip()

    # Only keys that pass the backend's own health check are stored.
    probe = await AliyunTTSBackend(
        api_key=candidate_key,
        region=settings.ALIYUN_REGION,
    ).health_check()
    if not probe.get("available", False):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid Aliyun API key. Please check your API key and try again."
        )

    # The key is persisted in encrypted form only.
    updated_user = update_user_aliyun_key(
        db,
        user_id=current_user.id,
        encrypted_api_key=encrypt_api_key(candidate_key),
    )
    if updated_user:
        return updated_user

    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="User not found"
    )
|
||||||
|
|
||||||
|
@router.get("/aliyun-key/verify", response_model=AliyunKeyVerifyResponse)
@limiter.limit("10/minute")
async def verify_aliyun_key(
    request: Request,
    current_user: Annotated[User, Depends(get_current_user)],
    db: Session = Depends(get_db)
):
    """Report whether the current user's stored Aliyun API key still works.

    Returns an ``AliyunKeyVerifyResponse`` in every case; ``valid`` is True
    only when a key is stored, it decrypts, and the backend health check
    reports ``available``. No exception is raised by this handler itself.
    """
    from core.security import decrypt_api_key
    from core.tts_service import AliyunTTSBackend

    stored_key = current_user.aliyun_api_key
    if not stored_key:
        return AliyunKeyVerifyResponse(
            valid=False,
            message="No Aliyun API key configured"
        )

    # decrypt_api_key yields a falsy value when the stored token is unusable.
    plain_key = decrypt_api_key(stored_key)
    if not plain_key:
        return AliyunKeyVerifyResponse(
            valid=False,
            message="Failed to decrypt API key"
        )

    backend = AliyunTTSBackend(api_key=plain_key, region=settings.ALIYUN_REGION)
    report = await backend.health_check()

    if not report.get("available", False):
        return AliyunKeyVerifyResponse(
            valid=False,
            message="Aliyun API key is not working. Please check your API key."
        )

    return AliyunKeyVerifyResponse(
        valid=True,
        message="Aliyun API key is valid and working"
    )
|
||||||
|
|||||||
@@ -36,9 +36,12 @@ async def process_custom_voice_job(
|
|||||||
job_id: int,
|
job_id: int,
|
||||||
user_id: int,
|
user_id: int,
|
||||||
request_data: dict,
|
request_data: dict,
|
||||||
|
backend_type: str,
|
||||||
db_url: str
|
db_url: str
|
||||||
):
|
):
|
||||||
from core.database import SessionLocal
|
from core.database import SessionLocal
|
||||||
|
from core.tts_service import TTSServiceFactory
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
@@ -51,42 +54,24 @@ async def process_custom_voice_job(
|
|||||||
job.started_at = datetime.utcnow()
|
job.started_at = datetime.utcnow()
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
logger.info(f"Processing custom-voice job {job_id}")
|
logger.info(f"Processing custom-voice job {job_id} with backend {backend_type}")
|
||||||
|
|
||||||
model_manager = await ModelManager.get_instance()
|
user_api_key = None
|
||||||
await model_manager.load_model("custom-voice")
|
if backend_type == "aliyun":
|
||||||
_, tts = await model_manager.get_current_model()
|
user = db.query(User).filter(User.id == user_id).first()
|
||||||
|
if user and user.aliyun_api_key:
|
||||||
|
user_api_key = decrypt_api_key(user.aliyun_api_key)
|
||||||
|
|
||||||
if tts is None:
|
backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
|
||||||
raise RuntimeError("Failed to load custom-voice model")
|
|
||||||
|
|
||||||
result = tts.generate_custom_voice(
|
audio_bytes, sample_rate = await backend.generate_custom_voice(request_data)
|
||||||
text=request_data['text'],
|
|
||||||
language=request_data['language'],
|
|
||||||
speaker=request_data['speaker'],
|
|
||||||
instruct=request_data.get('instruct', ''),
|
|
||||||
max_new_tokens=request_data['max_new_tokens'],
|
|
||||||
temperature=request_data['temperature'],
|
|
||||||
top_k=request_data['top_k'],
|
|
||||||
top_p=request_data['top_p'],
|
|
||||||
repetition_penalty=request_data['repetition_penalty']
|
|
||||||
)
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
if isinstance(result, tuple):
|
|
||||||
audio_data = result[0]
|
|
||||||
elif isinstance(result, list):
|
|
||||||
audio_data = np.array(result)
|
|
||||||
else:
|
|
||||||
audio_data = result
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||||
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
||||||
output_path = Path(settings.OUTPUT_DIR) / filename
|
output_path = Path(settings.OUTPUT_DIR) / filename
|
||||||
|
|
||||||
save_audio_file(audio_data, 24000, output_path)
|
with open(output_path, 'wb') as f:
|
||||||
|
f.write(audio_bytes)
|
||||||
|
|
||||||
job.status = JobStatus.COMPLETED
|
job.status = JobStatus.COMPLETED
|
||||||
job.output_path = str(output_path)
|
job.output_path = str(output_path)
|
||||||
@@ -112,9 +97,12 @@ async def process_voice_design_job(
|
|||||||
job_id: int,
|
job_id: int,
|
||||||
user_id: int,
|
user_id: int,
|
||||||
request_data: dict,
|
request_data: dict,
|
||||||
|
backend_type: str,
|
||||||
db_url: str
|
db_url: str
|
||||||
):
|
):
|
||||||
from core.database import SessionLocal
|
from core.database import SessionLocal
|
||||||
|
from core.tts_service import TTSServiceFactory
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
@@ -127,41 +115,24 @@ async def process_voice_design_job(
|
|||||||
job.started_at = datetime.utcnow()
|
job.started_at = datetime.utcnow()
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
logger.info(f"Processing voice-design job {job_id}")
|
logger.info(f"Processing voice-design job {job_id} with backend {backend_type}")
|
||||||
|
|
||||||
model_manager = await ModelManager.get_instance()
|
user_api_key = None
|
||||||
await model_manager.load_model("voice-design")
|
if backend_type == "aliyun":
|
||||||
_, tts = await model_manager.get_current_model()
|
user = db.query(User).filter(User.id == user_id).first()
|
||||||
|
if user and user.aliyun_api_key:
|
||||||
|
user_api_key = decrypt_api_key(user.aliyun_api_key)
|
||||||
|
|
||||||
if tts is None:
|
backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
|
||||||
raise RuntimeError("Failed to load voice-design model")
|
|
||||||
|
|
||||||
result = tts.generate_voice_design(
|
audio_bytes, sample_rate = await backend.generate_voice_design(request_data)
|
||||||
text=request_data['text'],
|
|
||||||
language=request_data['language'],
|
|
||||||
instruct=request_data['instruct'],
|
|
||||||
max_new_tokens=request_data['max_new_tokens'],
|
|
||||||
temperature=request_data['temperature'],
|
|
||||||
top_k=request_data['top_k'],
|
|
||||||
top_p=request_data['top_p'],
|
|
||||||
repetition_penalty=request_data['repetition_penalty']
|
|
||||||
)
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
if isinstance(result, tuple):
|
|
||||||
audio_data = result[0]
|
|
||||||
elif isinstance(result, list):
|
|
||||||
audio_data = np.array(result)
|
|
||||||
else:
|
|
||||||
audio_data = result
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||||
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
||||||
output_path = Path(settings.OUTPUT_DIR) / filename
|
output_path = Path(settings.OUTPUT_DIR) / filename
|
||||||
|
|
||||||
save_audio_file(audio_data, 24000, output_path)
|
with open(output_path, 'wb') as f:
|
||||||
|
f.write(audio_bytes)
|
||||||
|
|
||||||
job.status = JobStatus.COMPLETED
|
job.status = JobStatus.COMPLETED
|
||||||
job.output_path = str(output_path)
|
job.output_path = str(output_path)
|
||||||
@@ -188,9 +159,11 @@ async def process_voice_clone_job(
|
|||||||
user_id: int,
|
user_id: int,
|
||||||
request_data: dict,
|
request_data: dict,
|
||||||
ref_audio_path: str,
|
ref_audio_path: str,
|
||||||
|
backend_type: str,
|
||||||
db_url: str
|
db_url: str
|
||||||
):
|
):
|
||||||
from core.database import SessionLocal
|
from core.database import SessionLocal
|
||||||
|
from core.tts_service import TTSServiceFactory
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
@@ -204,7 +177,14 @@ async def process_voice_clone_job(
|
|||||||
job.started_at = datetime.utcnow()
|
job.started_at = datetime.utcnow()
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
logger.info(f"Processing voice-clone job {job_id}")
|
logger.info(f"Processing voice-clone job {job_id} with backend {backend_type}")
|
||||||
|
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
user_api_key = None
|
||||||
|
if backend_type == "aliyun":
|
||||||
|
user = db.query(User).filter(User.id == user_id).first()
|
||||||
|
if user and user.aliyun_api_key:
|
||||||
|
user_api_key = decrypt_api_key(user.aliyun_api_key)
|
||||||
|
|
||||||
with open(ref_audio_path, 'rb') as f:
|
with open(ref_audio_path, 'rb') as f:
|
||||||
ref_audio_data = f.read()
|
ref_audio_data = f.read()
|
||||||
@@ -212,6 +192,7 @@ async def process_voice_clone_job(
|
|||||||
cache_manager = await VoiceCacheManager.get_instance()
|
cache_manager = await VoiceCacheManager.get_instance()
|
||||||
ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data)
|
ref_audio_hash = cache_manager.get_audio_hash(ref_audio_data)
|
||||||
|
|
||||||
|
if request_data.get('x_vector_only_mode', False) and backend_type == "local":
|
||||||
x_vector = None
|
x_vector = None
|
||||||
cache_id = None
|
cache_id = None
|
||||||
|
|
||||||
@@ -238,7 +219,7 @@ async def process_voice_clone_job(
|
|||||||
x_vector = tts.create_voice_clone_prompt(
|
x_vector = tts.create_voice_clone_prompt(
|
||||||
ref_audio=(ref_audio_array, ref_sr),
|
ref_audio=(ref_audio_array, ref_sr),
|
||||||
ref_text=request_data.get('ref_text', ''),
|
ref_text=request_data.get('ref_text', ''),
|
||||||
x_vector_only_mode=request_data.get('x_vector_only_mode', False)
|
x_vector_only_mode=True
|
||||||
)
|
)
|
||||||
|
|
||||||
if request_data.get('use_cache', True):
|
if request_data.get('use_cache', True):
|
||||||
@@ -247,14 +228,13 @@ async def process_voice_clone_job(
|
|||||||
'duration': features['duration'],
|
'duration': features['duration'],
|
||||||
'sample_rate': features['sample_rate'],
|
'sample_rate': features['sample_rate'],
|
||||||
'ref_text': request_data.get('ref_text', ''),
|
'ref_text': request_data.get('ref_text', ''),
|
||||||
'x_vector_only_mode': request_data.get('x_vector_only_mode', False)
|
'x_vector_only_mode': True
|
||||||
}
|
}
|
||||||
cache_id = await cache_manager.set_cache(
|
cache_id = await cache_manager.set_cache(
|
||||||
user_id, ref_audio_hash, x_vector, metadata, db
|
user_id, ref_audio_hash, x_vector, metadata, db
|
||||||
)
|
)
|
||||||
logger.info(f"Created cache for job {job_id}, cache_id={cache_id}")
|
logger.info(f"Created cache for job {job_id}, cache_id={cache_id}")
|
||||||
|
|
||||||
if request_data.get('x_vector_only_mode', False):
|
|
||||||
job.status = JobStatus.COMPLETED
|
job.status = JobStatus.COMPLETED
|
||||||
job.output_path = f"x_vector_cached_{cache_id}"
|
job.output_path = f"x_vector_cached_{cache_id}"
|
||||||
job.completed_at = datetime.utcnow()
|
job.completed_at = datetime.utcnow()
|
||||||
@@ -262,31 +242,16 @@ async def process_voice_clone_job(
|
|||||||
logger.info(f"Job {job_id} completed (x_vector_only_mode)")
|
logger.info(f"Job {job_id} completed (x_vector_only_mode)")
|
||||||
return
|
return
|
||||||
|
|
||||||
model_manager = await ModelManager.get_instance()
|
backend = await TTSServiceFactory.get_backend(backend_type, user_api_key)
|
||||||
await model_manager.load_model("base")
|
|
||||||
_, tts = await model_manager.get_current_model()
|
|
||||||
|
|
||||||
if tts is None:
|
audio_bytes, sample_rate = await backend.generate_voice_clone(request_data, ref_audio_data)
|
||||||
raise RuntimeError("Failed to load base model")
|
|
||||||
|
|
||||||
wavs, sample_rate = tts.generate_voice_clone(
|
|
||||||
text=request_data['text'],
|
|
||||||
language=request_data['language'],
|
|
||||||
voice_clone_prompt=x_vector,
|
|
||||||
max_new_tokens=request_data['max_new_tokens'],
|
|
||||||
temperature=request_data['temperature'],
|
|
||||||
top_k=request_data['top_k'],
|
|
||||||
top_p=request_data['top_p'],
|
|
||||||
repetition_penalty=request_data['repetition_penalty']
|
|
||||||
)
|
|
||||||
|
|
||||||
audio_data = wavs[0] if isinstance(wavs, list) else wavs
|
|
||||||
|
|
||||||
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
||||||
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
filename = f"{user_id}_{job_id}_{timestamp}.wav"
|
||||||
output_path = Path(settings.OUTPUT_DIR) / filename
|
output_path = Path(settings.OUTPUT_DIR) / filename
|
||||||
|
|
||||||
save_audio_file(audio_data, sample_rate, output_path)
|
with open(output_path, 'wb') as f:
|
||||||
|
f.write(audio_bytes)
|
||||||
|
|
||||||
job.status = JobStatus.COMPLETED
|
job.status = JobStatus.COMPLETED
|
||||||
job.output_path = str(output_path)
|
job.output_path = str(output_path)
|
||||||
@@ -319,6 +284,16 @@ async def create_custom_voice_job(
|
|||||||
current_user: User = Depends(get_current_user),
|
current_user: User = Depends(get_current_user),
|
||||||
db: Session = Depends(get_db)
|
db: Session = Depends(get_db)
|
||||||
):
|
):
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
|
||||||
|
backend_type = req_data.backend or settings.DEFAULT_BACKEND
|
||||||
|
if backend_type == "aliyun":
|
||||||
|
if not current_user.aliyun_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
|
||||||
|
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
|
||||||
|
if not user_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
validate_text_length(req_data.text)
|
validate_text_length(req_data.text)
|
||||||
language = validate_language(req_data.language)
|
language = validate_language(req_data.language)
|
||||||
@@ -339,6 +314,7 @@ async def create_custom_voice_job(
|
|||||||
user_id=current_user.id,
|
user_id=current_user.id,
|
||||||
job_type="custom-voice",
|
job_type="custom-voice",
|
||||||
status=JobStatus.PENDING,
|
status=JobStatus.PENDING,
|
||||||
|
backend_type=backend_type,
|
||||||
input_data="",
|
input_data="",
|
||||||
input_params={
|
input_params={
|
||||||
"text": req_data.text,
|
"text": req_data.text,
|
||||||
@@ -365,6 +341,7 @@ async def create_custom_voice_job(
|
|||||||
job.id,
|
job.id,
|
||||||
current_user.id,
|
current_user.id,
|
||||||
request_data,
|
request_data,
|
||||||
|
backend_type,
|
||||||
str(settings.DATABASE_URL)
|
str(settings.DATABASE_URL)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -384,6 +361,16 @@ async def create_voice_design_job(
|
|||||||
current_user: User = Depends(get_current_user),
|
current_user: User = Depends(get_current_user),
|
||||||
db: Session = Depends(get_db)
|
db: Session = Depends(get_db)
|
||||||
):
|
):
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
|
||||||
|
backend_type = req_data.backend or settings.DEFAULT_BACKEND
|
||||||
|
if backend_type == "aliyun":
|
||||||
|
if not current_user.aliyun_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
|
||||||
|
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
|
||||||
|
if not user_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
validate_text_length(req_data.text)
|
validate_text_length(req_data.text)
|
||||||
language = validate_language(req_data.language)
|
language = validate_language(req_data.language)
|
||||||
@@ -406,6 +393,7 @@ async def create_voice_design_job(
|
|||||||
user_id=current_user.id,
|
user_id=current_user.id,
|
||||||
job_type="voice-design",
|
job_type="voice-design",
|
||||||
status=JobStatus.PENDING,
|
status=JobStatus.PENDING,
|
||||||
|
backend_type=backend_type,
|
||||||
input_data="",
|
input_data="",
|
||||||
input_params={
|
input_params={
|
||||||
"text": req_data.text,
|
"text": req_data.text,
|
||||||
@@ -430,6 +418,7 @@ async def create_voice_design_job(
|
|||||||
job.id,
|
job.id,
|
||||||
current_user.id,
|
current_user.id,
|
||||||
request_data,
|
request_data,
|
||||||
|
backend_type,
|
||||||
str(settings.DATABASE_URL)
|
str(settings.DATABASE_URL)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -455,10 +444,21 @@ async def create_voice_clone_job(
|
|||||||
top_k: Optional[int] = Form(default=50),
|
top_k: Optional[int] = Form(default=50),
|
||||||
top_p: Optional[float] = Form(default=1.0),
|
top_p: Optional[float] = Form(default=1.0),
|
||||||
repetition_penalty: Optional[float] = Form(default=1.05),
|
repetition_penalty: Optional[float] = Form(default=1.05),
|
||||||
|
backend: Optional[str] = Form(default=None),
|
||||||
background_tasks: BackgroundTasks = None,
|
background_tasks: BackgroundTasks = None,
|
||||||
current_user: User = Depends(get_current_user),
|
current_user: User = Depends(get_current_user),
|
||||||
db: Session = Depends(get_db)
|
db: Session = Depends(get_db)
|
||||||
):
|
):
|
||||||
|
from core.security import decrypt_api_key
|
||||||
|
|
||||||
|
backend_type = backend or settings.DEFAULT_BACKEND
|
||||||
|
if backend_type == "aliyun":
|
||||||
|
if not current_user.aliyun_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Aliyun API key not configured. Please set your API key first.")
|
||||||
|
user_api_key = decrypt_api_key(current_user.aliyun_api_key)
|
||||||
|
if not user_api_key:
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid Aliyun API key. Please update your API key.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
validate_text_length(text)
|
validate_text_length(text)
|
||||||
language = validate_language(language)
|
language = validate_language(language)
|
||||||
@@ -486,6 +486,7 @@ async def create_voice_clone_job(
|
|||||||
user_id=current_user.id,
|
user_id=current_user.id,
|
||||||
job_type="voice-clone",
|
job_type="voice-clone",
|
||||||
status=JobStatus.PENDING,
|
status=JobStatus.PENDING,
|
||||||
|
backend_type=backend_type,
|
||||||
input_data="",
|
input_data="",
|
||||||
input_params={
|
input_params={
|
||||||
"text": text,
|
"text": text,
|
||||||
@@ -520,6 +521,7 @@ async def create_voice_clone_job(
|
|||||||
current_user.id,
|
current_user.id,
|
||||||
request_data,
|
request_data,
|
||||||
tmp_audio_path,
|
tmp_audio_path,
|
||||||
|
backend_type,
|
||||||
str(settings.DATABASE_URL)
|
str(settings.DATABASE_URL)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,14 @@ class Settings(BaseSettings):
|
|||||||
MAX_TEXT_LENGTH: int = Field(default=1000)
|
MAX_TEXT_LENGTH: int = Field(default=1000)
|
||||||
MAX_AUDIO_SIZE_MB: int = Field(default=10)
|
MAX_AUDIO_SIZE_MB: int = Field(default=10)
|
||||||
|
|
||||||
|
ALIYUN_REGION: str = Field(default="beijing")
|
||||||
|
|
||||||
|
ALIYUN_MODEL_FLASH: str = Field(default="qwen3-tts-flash-realtime")
|
||||||
|
ALIYUN_MODEL_VC: str = Field(default="qwen3-tts-vc-realtime-2026-01-15")
|
||||||
|
ALIYUN_MODEL_VD: str = Field(default="qwen3-tts-vd-realtime-2026-01-15")
|
||||||
|
|
||||||
|
DEFAULT_BACKEND: str = Field(default="local")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
env_file = ".env"
|
env_file = ".env"
|
||||||
case_sensitive = True
|
case_sensitive = True
|
||||||
|
|||||||
@@ -2,6 +2,9 @@ from datetime import datetime, timedelta
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
from jose import JWTError, jwt
|
from jose import JWTError, jwt
|
||||||
from passlib.context import CryptContext
|
from passlib.context import CryptContext
|
||||||
|
from cryptography.fernet import Fernet
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
|
||||||
from config import settings
|
from config import settings
|
||||||
|
|
||||||
@@ -33,3 +36,24 @@ def decode_access_token(token: str) -> Optional[str]:
|
|||||||
return username
|
return username
|
||||||
except JWTError:
|
except JWTError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _get_fernet_key() -> bytes:
    """Derive the Fernet key used for API-key encryption from ``SECRET_KEY``.

    The configured secret is hashed with SHA-256 and the 32-byte digest is
    returned URL-safe base64 encoded.
    """
    secret_bytes = settings.SECRET_KEY.encode()
    digest = hashlib.sha256(secret_bytes).digest()
    return base64.urlsafe_b64encode(digest)
|
||||||
|
|
||||||
|
def encrypt_api_key(api_key: str) -> str:
    """Encrypt ``api_key`` with Fernet and return the token as text.

    A falsy ``api_key`` (empty string or ``None``) is not encrypted; an empty
    string is returned instead.
    """
    if api_key:
        cipher = Fernet(_get_fernet_key())
        token = cipher.encrypt(api_key.encode())
        return token.decode()
    return ""
|
||||||
|
|
||||||
|
def decrypt_api_key(encrypted_key: str) -> Optional[str]:
    """Decrypt a token produced by ``encrypt_api_key``.

    Returns the plaintext key, or ``None`` when ``encrypted_key`` is falsy or
    when anything goes wrong while decrypting or decoding it.
    """
    if not encrypted_key:
        return None

    try:
        cipher = Fernet(_get_fernet_key())
        return cipher.decrypt(encrypted_key.encode()).decode()
    except Exception:
        # Best effort by design: every failure is reported to callers as None,
        # which they treat as "no usable key".
        return None
|
||||||
|
|||||||
371
qwen3-tts-backend/core/tts_service.py
Normal file
371
qwen3-tts-backend/core/tts_service.py
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Tuple, Optional
|
||||||
|
import websockets
|
||||||
|
import json
|
||||||
|
import base64
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TTSBackend(ABC):
    """Abstract interface every TTS backend implements.

    The three generation coroutines return a ``(audio_bytes, sample_rate)``
    tuple; ``health_check`` returns a dict describing the backend's state.
    """

    @abstractmethod
    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
        """Synthesize speech for the request described by ``params``."""

    @abstractmethod
    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
        """Synthesize speech with a voice described by ``params``."""

    @abstractmethod
    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
        """Synthesize speech using ``ref_audio_bytes`` as the reference audio."""

    @abstractmethod
    async def health_check(self) -> dict:
        """Return a dict describing whether the backend is usable."""
|
||||||
|
|
||||||
|
|
||||||
|
class LocalTTSBackend(TTSBackend):
    """TTS backend that generates audio through the in-process ``ModelManager``.

    Each generation coroutine loads the model it needs, runs the matching
    ``tts.generate_*`` call and returns ``(wav_bytes, sample_rate)``.
    """

    # Rate used for custom-voice and voice-design output: this class does not
    # read a rate from those generation results, so the same value goes into
    # the WAV header and the returned tuple.
    DEFAULT_SAMPLE_RATE = 24000

    def __init__(self):
        # Populated by initialize(), or lazily on first generation.
        self.model_manager = None

    async def initialize(self):
        """Attach the shared ``ModelManager`` instance to this backend."""
        from core.model_manager import ModelManager
        self.model_manager = await ModelManager.get_instance()

    async def _load_tts(self, model_name: str):
        """Load ``model_name`` and return its TTS handle.

        Raises:
            RuntimeError: if the model manager yields no model after loading.
        """
        # Guard against use before initialize(); otherwise the next line would
        # fail with an opaque AttributeError on None.
        if self.model_manager is None:
            await self.initialize()

        await self.model_manager.load_model(model_name)
        _, tts = await self.model_manager.get_current_model()
        if tts is None:
            raise RuntimeError(f"Failed to load {model_name} model")
        return tts

    @staticmethod
    def _first_if_tuple(result):
        """Return ``result[0]`` for a tuple result, otherwise ``result`` itself."""
        return result[0] if isinstance(result, tuple) else result

    async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
        """Generate speech with the ``custom-voice`` model.

        ``params`` must contain ``text``, ``language``, ``speaker``,
        ``max_new_tokens``, ``temperature``, ``top_k``, ``top_p`` and
        ``repetition_penalty``; ``instruct`` is optional (defaults to ``''``).

        Raises:
            RuntimeError: if the model could not be loaded.
            KeyError: if a required entry is missing from ``params``.
        """
        tts = await self._load_tts("custom-voice")

        result = tts.generate_custom_voice(
            text=params['text'],
            language=params['language'],
            speaker=params['speaker'],
            instruct=params.get('instruct', ''),
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        # A list result is converted to an array inside _numpy_to_bytes.
        audio_data = self._first_if_tuple(result)
        rate = self.DEFAULT_SAMPLE_RATE
        return self._numpy_to_bytes(audio_data, rate), rate

    async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
        """Generate speech with the ``voice-design`` model.

        ``params`` must contain ``text``, ``language``, ``instruct``,
        ``max_new_tokens``, ``temperature``, ``top_k``, ``top_p`` and
        ``repetition_penalty``.

        Raises:
            RuntimeError: if the model could not be loaded.
            KeyError: if a required entry is missing from ``params``.
        """
        tts = await self._load_tts("voice-design")

        result = tts.generate_voice_design(
            text=params['text'],
            language=params['language'],
            instruct=params['instruct'],
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        audio_data = self._first_if_tuple(result)
        rate = self.DEFAULT_SAMPLE_RATE
        return self._numpy_to_bytes(audio_data, rate), rate

    async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
        """Generate speech with the ``base`` model, cloning the reference audio.

        The reference audio is decoded with ``process_ref_audio``, turned into
        a voice-clone prompt and passed to ``tts.generate_voice_clone``. The
        returned sample rate is the one reported by that call, and the WAV
        header is written with the same rate.

        Raises:
            RuntimeError: if the model could not be loaded.
            KeyError: if a required entry is missing from ``params``.
        """
        from utils.audio import process_ref_audio

        ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes)

        tts = await self._load_tts("base")

        x_vector = tts.create_voice_clone_prompt(
            ref_audio=(ref_audio_array, ref_sr),
            ref_text=params.get('ref_text', ''),
            x_vector_only_mode=False
        )

        wavs, sample_rate = tts.generate_voice_clone(
            text=params['text'],
            language=params['language'],
            voice_clone_prompt=x_vector,
            max_new_tokens=params['max_new_tokens'],
            temperature=params['temperature'],
            top_k=params['top_k'],
            top_p=params['top_p'],
            repetition_penalty=params['repetition_penalty']
        )

        audio_data = wavs[0] if isinstance(wavs, list) else wavs
        # Write the header with the rate the model reported so it always
        # matches the rate returned to the caller.
        return self._numpy_to_bytes(audio_data, sample_rate), sample_rate

    async def health_check(self) -> dict:
        """Return availability and the currently loaded model name, if any."""
        return {
            "available": self.model_manager is not None,
            "current_model": self.model_manager.current_model_name if self.model_manager else None
        }

    @staticmethod
    def _numpy_to_bytes(audio_array, sample_rate: int = DEFAULT_SAMPLE_RATE) -> bytes:
        """Encode float samples as 16-bit PCM WAV bytes.

        Args:
            audio_array: Samples as an array or list; assumed to be floats
                nominally in [-1.0, 1.0] (TODO confirm against the model
                output).
            sample_rate: Rate written into the WAV header. Defaults to 24000,
                the value this method previously hard-coded.
        """
        import io

        import numpy as np
        import scipy.io.wavfile

        samples = np.asarray(audio_array)
        # Clip before scaling: values outside [-1, 1] would otherwise wrap
        # around when cast to int16.
        pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

        buffer = io.BytesIO()
        scipy.io.wavfile.write(buffer, int(sample_rate), pcm16)
        return buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
class AliyunTTSBackend(TTSBackend):
|
||||||
|
def __init__(self, api_key: str, region: str):
    """Store the credentials and resolve both service endpoints for ``region``."""
    self.api_key, self.region = api_key, region
    # Endpoints are resolved once here and reused by every request.
    self.ws_url = self._get_ws_url(region)
    self.http_url = self._get_http_url(region)
|
||||||
|
|
||||||
|
def _get_ws_url(self, region: str) -> str:
|
||||||
|
if region == "beijing":
|
||||||
|
return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
else:
|
||||||
|
return "wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime"
|
||||||
|
|
||||||
|
def _get_http_url(self, region: str) -> str:
|
||||||
|
if region == "beijing":
|
||||||
|
return "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
|
||||||
|
else:
|
||||||
|
return "https://dashscope-intl.aliyuncs.com/api/v1/services/audio/tts/customization"
|
||||||
|
|
||||||
|
async def generate_custom_voice(self, params: dict) -> Tuple[bytes, int]:
    """Synthesize ``params['text']`` with the configured flash model.

    The requested ``params['speaker']`` is translated with ``_map_speaker``
    and the request is sent through ``_generate_via_websocket``, whose
    ``(audio_bytes, sample_rate)`` result is returned unchanged.
    """
    from core.config import settings

    mapped_voice = self._map_speaker(params['speaker'])

    return await self._generate_via_websocket(
        model=settings.ALIYUN_MODEL_FLASH,
        text=params['text'],
        voice=mapped_voice,
        language=params['language'],
    )
|
||||||
|
|
||||||
|
async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
    """Synthesize ``params['text']`` with a voice created from ``params['instruct']``.

    A voice id is first obtained from ``_create_voice_design`` (using the
    request text as the preview text), then the text is synthesized with the
    configured voice-design model through ``_generate_via_websocket``.
    """
    from core.config import settings

    designed_voice = await self._create_voice_design(
        instruct=params['instruct'],
        preview_text=params['text'],
    )

    return await self._generate_via_websocket(
        model=settings.ALIYUN_MODEL_VD,
        text=params['text'],
        voice=designed_voice,
        language=params['language'],
    )
|
||||||
|
|
||||||
|
async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
    """Synthesize ``params['text']`` with a voice enrolled from the reference audio.

    A voice id is first obtained from ``_create_voice_clone``, then the text
    is synthesized with the configured voice-clone model through
    ``_generate_via_websocket``.
    """
    from core.config import settings

    cloned_voice = await self._create_voice_clone(ref_audio_bytes)

    return await self._generate_via_websocket(
        model=settings.ALIYUN_MODEL_VC,
        text=params['text'],
        voice=cloned_voice,
        language=params['language'],
    )
|
||||||
|
|
||||||
|
async def _generate_via_websocket(
    self,
    model: str,
    text: str,
    voice: str,
    language: str
) -> Tuple[bytes, int]:
    """Synthesize ``text`` over the realtime WebSocket endpoint.

    Sends a ``session.update``, the text, and ``session.finish``, then
    collects every ``response.audio.delta`` payload until ``session.finished``
    arrives or the connection ends. The collected PCM is wrapped with
    ``_pcm_to_wav``.

    Returns:
        ``(wav_bytes, sample_rate)`` with the rate requested in the session.

    Raises:
        RuntimeError: if the service sends an ``error`` event, or if the
            stream ends without delivering any audio.
    """
    # Single source for the rate requested from the service, written into the
    # WAV header and reported to the caller.
    sample_rate = 24000

    url = f"{self.ws_url}?model={model}"
    headers = {"Authorization": f"Bearer {self.api_key}"}

    # Client events, in the order they are sent.
    outbound_events = (
        {
            "type": "session.update",
            "session": {
                "mode": "server_commit",
                "voice": voice,
                "language_type": language,
                "response_format": "pcm",
                "sample_rate": sample_rate
            }
        },
        {
            "type": "input_text_buffer.append",
            "text": text
        },
        {
            "type": "session.finish"
        },
    )

    audio_chunks = []

    async with websockets.connect(url, additional_headers=headers) as ws:
        for outbound in outbound_events:
            await ws.send(json.dumps(outbound))

        async for message in ws:
            event = json.loads(message)
            event_type = event.get('type')

            if event_type == 'response.audio.delta':
                audio_chunks.append(base64.b64decode(event['delta']))
            elif event_type == 'session.finished':
                break
            elif event_type == 'error':
                raise RuntimeError(f"Aliyun API error: {event.get('error')}")

    # A stream that closes without any audio would otherwise be returned as a
    # "successful" header-only WAV; report it as a failure instead.
    if not audio_chunks:
        raise RuntimeError("Aliyun API returned no audio data")

    pcm_data = b''.join(audio_chunks)
    wav_bytes = self._pcm_to_wav(pcm_data, sample_rate)
    return wav_bytes, sample_rate
|
||||||
|
|
||||||
|
async def _create_voice_clone(self, ref_audio_bytes: bytes) -> str:
    """Enroll reference audio as a cloned voice and return its identifier.

    The audio is base64-encoded into a ``data:audio/wav`` URI and posted to
    ``self.http_url`` with the ``qwen-voice-enrollment`` model.

    Args:
        ref_audio_bytes: Raw bytes of the reference recording.

    Returns:
        The value found at ``['output']['voice']`` in the JSON response.

    Raises:
        httpx.HTTPStatusError: If the response status indicates an error
            (raised by ``raise_for_status``).
        KeyError: If the response JSON lacks ``output`` / ``voice``.
    """
    from core.config import settings
    import httpx

    encoded_audio = base64.b64encode(ref_audio_bytes).decode()

    enrollment_input = {
        "action": "create",
        "target_model": settings.ALIYUN_MODEL_VC,
        # Timestamp-based name, one-second resolution.
        "preferred_name": f"clone_{int(time.time())}",
        "audio": {"data": f"data:audio/wav;base64,{encoded_audio}"},
    }
    request_body = {
        "model": "qwen-voice-enrollment",
        "input": enrollment_input,
    }
    auth_headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json",
    }

    async with httpx.AsyncClient() as http:
        response = await http.post(
            self.http_url,
            json=request_body,
            headers=auth_headers,
            timeout=60,
        )
        response.raise_for_status()
        return response.json()['output']['voice']
|
||||||
|
|
||||||
|
async def _create_voice_design(self, instruct: str, preview_text: str) -> str:
    """Create a designed voice from a description and return its identifier.

    Posts a ``qwen-voice-design`` creation request to ``self.http_url``.
    The request always declares ``"language": "zh"`` and asks for a 24 kHz
    WAV preview.

    Args:
        instruct: Voice description, sent as ``voice_prompt``.
        preview_text: Text sent as ``preview_text``.

    Returns:
        The value found at ``['output']['voice']`` in the JSON response.

    Raises:
        httpx.HTTPStatusError: If the response status indicates an error
            (raised by ``raise_for_status``).
        KeyError: If the response JSON lacks ``output`` / ``voice``.
    """
    from core.config import settings
    import httpx

    design_input = {
        "action": "create",
        "target_model": settings.ALIYUN_MODEL_VD,
        "voice_prompt": instruct,
        "preview_text": preview_text,
        # Timestamp-based name, one-second resolution.
        "preferred_name": f"design_{int(time.time())}",
        "language": "zh",
    }
    request_body = {
        "model": "qwen-voice-design",
        "input": design_input,
        "parameters": {"sample_rate": 24000, "response_format": "wav"},
    }
    auth_headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json",
    }

    async with httpx.AsyncClient() as http:
        response = await http.post(
            self.http_url,
            json=request_body,
            headers=auth_headers,
            timeout=60,
        )
        response.raise_for_status()
        return response.json()['output']['voice']
|
||||||
|
|
||||||
|
async def health_check(self) -> dict:
    """Probe the remote service and report whether it appears reachable.

    Issues a GET to ``self.http_url`` with ``/customization`` replaced by
    ``/health``, authenticated with the configured API key, using a
    five-second timeout.

    Returns:
        ``{"available": True}`` when a response with status below 500 is
        received, otherwise ``{"available": False}``. Ordinary failures
        (network errors, timeouts, missing ``httpx``) are reported as
        unavailable rather than raised.
    """
    try:
        import httpx

        probe_url = self.http_url.replace('/customization', '/health')
        async with httpx.AsyncClient() as client:
            resp = await client.get(
                probe_url,
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=5
            )
        # NOTE(review): any status below 500 counts as available, including
        # 401/403/404. If callers use this to validate an API key, a rejected
        # key would still be reported as available — confirm intended.
        return {"available": resp.status_code < 500}
    except Exception:
        # Deliberately not a bare ``except``: task cancellation
        # (asyncio.CancelledError), KeyboardInterrupt and SystemExit must
        # propagate instead of being reported as "unavailable".
        return {"available": False}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _pcm_to_wav(pcm_data: bytes, sample_rate: int) -> bytes:
|
||||||
|
import io
|
||||||
|
import wave
|
||||||
|
|
||||||
|
wav_buffer = io.BytesIO()
|
||||||
|
with wave.open(wav_buffer, 'wb') as wav_file:
|
||||||
|
wav_file.setnchannels(1)
|
||||||
|
wav_file.setsampwidth(2)
|
||||||
|
wav_file.setframerate(sample_rate)
|
||||||
|
wav_file.writeframes(pcm_data)
|
||||||
|
|
||||||
|
wav_buffer.seek(0)
|
||||||
|
return wav_buffer.read()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _map_speaker(local_speaker: str) -> str:
|
||||||
|
mapping = {
|
||||||
|
"Vivian": "Cherry",
|
||||||
|
"Serena": "Lili",
|
||||||
|
"Uncle_Fu": "Longxiaochun",
|
||||||
|
"Dylan": "Longxiaochun",
|
||||||
|
"Female": "Cherry",
|
||||||
|
"Male": "Longxiaochun"
|
||||||
|
}
|
||||||
|
return mapping.get(local_speaker, "Cherry")
|
||||||
|
|
||||||
|
|
||||||
|
class TTSServiceFactory:
    """Hand out shared TTS backend instances.

    A single local backend is created and initialized on first use. Aliyun
    backends are cached per user API key.
    """

    # Cached local backend; only set once ``initialize()`` has succeeded.
    _local_backend: Optional[LocalTTSBackend] = None
    # Not referenced inside this class body; kept for compatibility.
    _aliyun_backend: Optional[AliyunTTSBackend] = None
    # Cache of Aliyun backends keyed by the user's API key.
    # NOTE(review): entries are never evicted, so this grows with the number
    # of distinct keys seen — confirm that is acceptable.
    _user_aliyun_backends: dict[str, AliyunTTSBackend] = {}
    # Created lazily inside ``get_backend`` so it is built while an event
    # loop is running.
    _local_init_lock = None

    @classmethod
    async def get_backend(cls, backend_type: Optional[str] = None, user_api_key: Optional[str] = None) -> TTSBackend:
        """Return the backend for ``backend_type``.

        Args:
            backend_type: ``"local"`` or ``"aliyun"``; when ``None`` the
                value of ``settings.DEFAULT_BACKEND`` is used.
            user_api_key: API key for the Aliyun backend; required when
                ``backend_type`` resolves to ``"aliyun"``.

        Returns:
            The cached or newly created backend instance.

        Raises:
            ValueError: If the Aliyun backend is requested without an API
                key, or if ``backend_type`` is not recognised.
        """
        import asyncio

        from core.config import settings

        if backend_type is None:
            backend_type = settings.DEFAULT_BACKEND

        if backend_type == "local":
            if cls._local_backend is None:
                # No await between this check and the assignment, so only
                # one lock object is ever created per event loop thread.
                if cls._local_init_lock is None:
                    cls._local_init_lock = asyncio.Lock()
                async with cls._local_init_lock:
                    # Re-check: another task may have finished initializing
                    # while this one waited for the lock.
                    if cls._local_backend is None:
                        backend = LocalTTSBackend()
                        await backend.initialize()
                        # Publish only after a successful initialize() so
                        # neither a concurrent caller nor a later call can
                        # observe a half-initialized or failed backend.
                        cls._local_backend = backend
            return cls._local_backend

        if backend_type == "aliyun":
            if not user_api_key:
                raise ValueError("Aliyun backend requires user API key. Please set your API key first.")

            if user_api_key not in cls._user_aliyun_backends:
                cls._user_aliyun_backends[user_api_key] = AliyunTTSBackend(
                    api_key=user_api_key,
                    region=settings.ALIYUN_REGION
                )
            return cls._user_aliyun_backends[user_api_key]

        raise ValueError(f"Unknown backend type: {backend_type}")
|
||||||
@@ -103,6 +103,21 @@ def change_user_password(
|
|||||||
db.refresh(user)
|
db.refresh(user)
|
||||||
return user
|
return user
|
||||||
|
|
||||||
|
def update_user_aliyun_key(
    db: Session,
    user_id: int,
    encrypted_api_key: str
) -> Optional[User]:
    """Store an (already encrypted) Aliyun API key on a user record.

    Args:
        db: Database session used to load and persist the user.
        user_id: Identifier passed to ``get_user_by_id``.
        encrypted_api_key: Value written to ``user.aliyun_api_key`` as-is.

    Returns:
        The refreshed user after commit, or ``None`` when no user is found.
    """
    account = get_user_by_id(db, user_id)
    if account:
        account.aliyun_api_key = encrypted_api_key
        account.updated_at = datetime.utcnow()
        db.commit()
        db.refresh(account)
        return account
    return None
|
||||||
|
|
||||||
def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job:
|
def create_job(db: Session, user_id: int, job_type: str, input_data: Dict[str, Any]) -> Job:
|
||||||
job = Job(
|
job = Job(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ class User(Base):
|
|||||||
hashed_password = Column(String(255), nullable=False)
|
hashed_password = Column(String(255), nullable=False)
|
||||||
is_active = Column(Boolean, default=True, nullable=False)
|
is_active = Column(Boolean, default=True, nullable=False)
|
||||||
is_superuser = Column(Boolean, default=False, nullable=False)
|
is_superuser = Column(Boolean, default=False, nullable=False)
|
||||||
|
aliyun_api_key = Column(Text, nullable=True)
|
||||||
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
||||||
|
|
||||||
@@ -33,6 +34,7 @@ class Job(Base):
|
|||||||
user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
|
user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
|
||||||
job_type = Column(String(50), nullable=False)
|
job_type = Column(String(50), nullable=False)
|
||||||
status = Column(String(50), default="pending", nullable=False, index=True)
|
status = Column(String(50), default="pending", nullable=False, index=True)
|
||||||
|
backend_type = Column(String(20), default="local", nullable=False)
|
||||||
input_data = Column(Text, nullable=True)
|
input_data = Column(Text, nullable=True)
|
||||||
input_params = Column(JSON, nullable=True)
|
input_params = Column(JSON, nullable=True)
|
||||||
output_path = Column(String(500), nullable=True)
|
output_path = Column(String(500), nullable=True)
|
||||||
|
|||||||
@@ -194,6 +194,27 @@ async def health_check():
|
|||||||
if queue_length > 50:
|
if queue_length > 50:
|
||||||
minor_issues.append("queue_congested")
|
minor_issues.append("queue_congested")
|
||||||
|
|
||||||
|
backends_status = {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
from core.tts_service import TTSServiceFactory
|
||||||
|
|
||||||
|
try:
|
||||||
|
local_backend = await TTSServiceFactory.get_backend("local")
|
||||||
|
local_health = await local_backend.health_check()
|
||||||
|
backends_status["local"] = local_health
|
||||||
|
except Exception as e:
|
||||||
|
backends_status["local"] = {"available": False, "error": str(e)}
|
||||||
|
|
||||||
|
backends_status["aliyun"] = {
|
||||||
|
"available": True,
|
||||||
|
"region": settings.ALIYUN_REGION,
|
||||||
|
"note": "Requires user API key configuration"
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Backend health check failed: {e}")
|
||||||
|
backends_status = {"error": str(e)}
|
||||||
|
|
||||||
if critical_issues:
|
if critical_issues:
|
||||||
status = "unhealthy"
|
status = "unhealthy"
|
||||||
elif minor_issues:
|
elif minor_issues:
|
||||||
@@ -211,6 +232,7 @@ async def health_check():
|
|||||||
"database_connected": database_connected,
|
"database_connected": database_connected,
|
||||||
"cache_dir_writable": cache_dir_writable,
|
"cache_dir_writable": cache_dir_writable,
|
||||||
"output_dir_writable": output_dir_writable,
|
"output_dir_writable": output_dir_writable,
|
||||||
|
"backends": backends_status,
|
||||||
"issues": {
|
"issues": {
|
||||||
"critical": critical_issues,
|
"critical": critical_issues,
|
||||||
"minor": minor_issues
|
"minor": minor_issues
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ python-multipart==0.0.12
|
|||||||
python-jose[cryptography]==3.3.0
|
python-jose[cryptography]==3.3.0
|
||||||
passlib==1.7.4
|
passlib==1.7.4
|
||||||
bcrypt==3.2.2
|
bcrypt==3.2.2
|
||||||
|
cryptography>=41.0.0
|
||||||
sqlalchemy==2.0.35
|
sqlalchemy==2.0.35
|
||||||
aiosqlite==0.20.0
|
aiosqlite==0.20.0
|
||||||
soundfile==0.12.1
|
soundfile==0.12.1
|
||||||
@@ -17,3 +18,4 @@ pytest==8.3.0
|
|||||||
pytest-cov==4.1.0
|
pytest-cov==4.1.0
|
||||||
pytest-asyncio==0.23.0
|
pytest-asyncio==0.23.0
|
||||||
httpx==0.27.0
|
httpx==0.27.0
|
||||||
|
websockets>=12.0
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ class CustomVoiceRequest(BaseModel):
|
|||||||
top_k: Optional[int] = Field(default=50, ge=1, le=100)
|
top_k: Optional[int] = Field(default=50, ge=1, le=100)
|
||||||
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
|
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
|
||||||
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
|
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
|
||||||
|
backend: Optional[str] = Field(default=None, description="Backend type: local or aliyun")
|
||||||
|
|
||||||
|
|
||||||
class VoiceDesignRequest(BaseModel):
|
class VoiceDesignRequest(BaseModel):
|
||||||
@@ -35,6 +36,7 @@ class VoiceDesignRequest(BaseModel):
|
|||||||
top_k: Optional[int] = Field(default=50, ge=1, le=100)
|
top_k: Optional[int] = Field(default=50, ge=1, le=100)
|
||||||
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
|
top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
|
||||||
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
|
repetition_penalty: Optional[float] = Field(default=1.05, ge=1.0, le=2.0)
|
||||||
|
backend: Optional[str] = Field(default=None)
|
||||||
|
|
||||||
|
|
||||||
class VoiceCloneRequest(BaseModel):
|
class VoiceCloneRequest(BaseModel):
|
||||||
|
|||||||
@@ -111,3 +111,10 @@ class PasswordChange(BaseModel):
|
|||||||
if self.new_password != self.confirm_password:
|
if self.new_password != self.confirm_password:
|
||||||
raise ValueError('Passwords do not match')
|
raise ValueError('Passwords do not match')
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
class AliyunKeyUpdate(BaseModel):
    """Request body carrying an Aliyun API key to store for the user."""

    # Required; between 1 and 500 characters long.
    api_key: str = Field(..., min_length=1, max_length=500)
|
||||||
|
|
||||||
|
class AliyunKeyVerifyResponse(BaseModel):
    """Response model with a validity flag and an accompanying message."""

    # Presumably whether the checked API key was accepted — confirm at the call site.
    valid: bool
    # Text accompanying the result.
    message: str
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/qwen.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<link rel="preload" href="/fonts/noto-serif-regular.woff2" as="font" type="font/woff2" crossorigin>
|
<link rel="preload" href="/fonts/noto-serif-regular.woff2" as="font" type="font/woff2" crossorigin>
|
||||||
<title>Qwen3-TTS-WebUI</title>
|
<title>Qwen3-TTS-WebUI</title>
|
||||||
|
|||||||
1
qwen3-tts-frontend/public/qwen.svg
Normal file
1
qwen3-tts-frontend/public/qwen.svg
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Redirecting to /@lobehub/icons-static-svg@1.78.0/icons/qwen.svg
|
||||||
@@ -32,6 +32,7 @@ const formSchema = z.object({
|
|||||||
top_k: z.number().min(1).max(100).optional(),
|
top_k: z.number().min(1).max(100).optional(),
|
||||||
top_p: z.number().min(0).max(1).optional(),
|
top_p: z.number().min(0).max(1).optional(),
|
||||||
repetition_penalty: z.number().min(0).max(2).optional(),
|
repetition_penalty: z.number().min(0).max(2).optional(),
|
||||||
|
backend: z.string().optional(),
|
||||||
})
|
})
|
||||||
|
|
||||||
type FormData = z.infer<typeof formSchema>
|
type FormData = z.infer<typeof formSchema>
|
||||||
@@ -74,6 +75,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
|
|||||||
top_k: 20,
|
top_k: 20,
|
||||||
top_p: 0.7,
|
top_p: 0.7,
|
||||||
repetition_penalty: 1.05,
|
repetition_penalty: 1.05,
|
||||||
|
backend: 'local',
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -88,6 +90,7 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
|
|||||||
setValue('top_k', params.top_k || 20)
|
setValue('top_k', params.top_k || 20)
|
||||||
setValue('top_p', params.top_p || 0.7)
|
setValue('top_p', params.top_p || 0.7)
|
||||||
setValue('repetition_penalty', params.repetition_penalty || 1.05)
|
setValue('repetition_penalty', params.repetition_penalty || 1.05)
|
||||||
|
setValue('backend', params.backend || 'local')
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
|
|
||||||
@@ -131,6 +134,22 @@ const CustomVoiceForm = forwardRef<CustomVoiceFormHandle>((_props, ref) => {
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
|
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label>后端选择</Label>
|
||||||
|
<Select
|
||||||
|
value={watch('backend')}
|
||||||
|
onValueChange={(value: string) => setValue('backend', value)}
|
||||||
|
>
|
||||||
|
<SelectTrigger>
|
||||||
|
<SelectValue />
|
||||||
|
</SelectTrigger>
|
||||||
|
<SelectContent>
|
||||||
|
<SelectItem value="local">本地模型</SelectItem>
|
||||||
|
<SelectItem value="aliyun">阿里云 API</SelectItem>
|
||||||
|
</SelectContent>
|
||||||
|
</Select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="space-y-0.5">
|
<div className="space-y-0.5">
|
||||||
<IconLabel icon={Globe2} tooltip="语言" required />
|
<IconLabel icon={Globe2} tooltip="语言" required />
|
||||||
<Select
|
<Select
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ const formSchema = z.object({
|
|||||||
top_k: z.number().min(1).max(100).optional(),
|
top_k: z.number().min(1).max(100).optional(),
|
||||||
top_p: z.number().min(0).max(1).optional(),
|
top_p: z.number().min(0).max(1).optional(),
|
||||||
repetition_penalty: z.number().min(0).max(2).optional(),
|
repetition_penalty: z.number().min(0).max(2).optional(),
|
||||||
|
backend: z.string().optional(),
|
||||||
})
|
})
|
||||||
|
|
||||||
type FormData = z.infer<typeof formSchema>
|
type FormData = z.infer<typeof formSchema>
|
||||||
@@ -75,6 +76,7 @@ function VoiceCloneForm() {
|
|||||||
top_k: 20,
|
top_k: 20,
|
||||||
top_p: 0.7,
|
top_p: 0.7,
|
||||||
repetition_penalty: 1.05,
|
repetition_penalty: 1.05,
|
||||||
|
backend: 'local',
|
||||||
} as Partial<FormData>,
|
} as Partial<FormData>,
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -233,6 +235,22 @@ function VoiceCloneForm() {
|
|||||||
|
|
||||||
<div className={step === 2 ? 'block space-y-4' : 'hidden'}>
|
<div className={step === 2 ? 'block space-y-4' : 'hidden'}>
|
||||||
{/* Step 2: Synthesis Options */}
|
{/* Step 2: Synthesis Options */}
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label>后端选择</Label>
|
||||||
|
<Select
|
||||||
|
value={watch('backend')}
|
||||||
|
onValueChange={(value: string) => setValue('backend', value)}
|
||||||
|
>
|
||||||
|
<SelectTrigger>
|
||||||
|
<SelectValue />
|
||||||
|
</SelectTrigger>
|
||||||
|
<SelectContent>
|
||||||
|
<SelectItem value="local">本地模型</SelectItem>
|
||||||
|
<SelectItem value="aliyun">阿里云 API</SelectItem>
|
||||||
|
</SelectContent>
|
||||||
|
</Select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="space-y-0.5">
|
<div className="space-y-0.5">
|
||||||
<IconLabel icon={Globe2} tooltip="语言(可选)" />
|
<IconLabel icon={Globe2} tooltip="语言(可选)" />
|
||||||
<Select
|
<Select
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ const formSchema = z.object({
|
|||||||
top_k: z.number().min(1).max(100).optional(),
|
top_k: z.number().min(1).max(100).optional(),
|
||||||
top_p: z.number().min(0).max(1).optional(),
|
top_p: z.number().min(0).max(1).optional(),
|
||||||
repetition_penalty: z.number().min(0).max(2).optional(),
|
repetition_penalty: z.number().min(0).max(2).optional(),
|
||||||
|
backend: z.string().optional(),
|
||||||
})
|
})
|
||||||
|
|
||||||
type FormData = z.infer<typeof formSchema>
|
type FormData = z.infer<typeof formSchema>
|
||||||
@@ -71,6 +72,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
|
|||||||
top_k: 20,
|
top_k: 20,
|
||||||
top_p: 0.7,
|
top_p: 0.7,
|
||||||
repetition_penalty: 1.05,
|
repetition_penalty: 1.05,
|
||||||
|
backend: 'local',
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -84,6 +86,7 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
|
|||||||
setValue('top_k', params.top_k || 20)
|
setValue('top_k', params.top_k || 20)
|
||||||
setValue('top_p', params.top_p || 0.7)
|
setValue('top_p', params.top_p || 0.7)
|
||||||
setValue('repetition_penalty', params.repetition_penalty || 1.05)
|
setValue('repetition_penalty', params.repetition_penalty || 1.05)
|
||||||
|
setValue('backend', params.backend || 'local')
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
|
|
||||||
@@ -122,6 +125,22 @@ const VoiceDesignForm = forwardRef<VoiceDesignFormHandle>((_props, ref) => {
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
|
<form onSubmit={handleSubmit(onSubmit)} className="space-y-2">
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<Label>后端选择</Label>
|
||||||
|
<Select
|
||||||
|
value={watch('backend')}
|
||||||
|
onValueChange={(value: string) => setValue('backend', value)}
|
||||||
|
>
|
||||||
|
<SelectTrigger>
|
||||||
|
<SelectValue />
|
||||||
|
</SelectTrigger>
|
||||||
|
<SelectContent>
|
||||||
|
<SelectItem value="local">本地模型</SelectItem>
|
||||||
|
<SelectItem value="aliyun">阿里云 API</SelectItem>
|
||||||
|
</SelectContent>
|
||||||
|
</Select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="space-y-0.5">
|
<div className="space-y-0.5">
|
||||||
<IconLabel icon={Globe2} tooltip="语言" required />
|
<IconLabel icon={Globe2} tooltip="语言" required />
|
||||||
<Select
|
<Select
|
||||||
|
|||||||
@@ -247,6 +247,9 @@ export const ttsApi = {
|
|||||||
if (data.repetition_penalty !== undefined) {
|
if (data.repetition_penalty !== undefined) {
|
||||||
formData.append('repetition_penalty', String(data.repetition_penalty))
|
formData.append('repetition_penalty', String(data.repetition_penalty))
|
||||||
}
|
}
|
||||||
|
if (data.backend) {
|
||||||
|
formData.append('backend', data.backend)
|
||||||
|
}
|
||||||
|
|
||||||
const response = await apiClient.post<JobCreateResponse>(
|
const response = await apiClient.post<JobCreateResponse>(
|
||||||
API_ENDPOINTS.TTS.VOICE_CLONE,
|
API_ENDPOINTS.TTS.VOICE_CLONE,
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ export interface CustomVoiceForm {
|
|||||||
top_k?: number
|
top_k?: number
|
||||||
top_p?: number
|
top_p?: number
|
||||||
repetition_penalty?: number
|
repetition_penalty?: number
|
||||||
|
backend?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface VoiceDesignForm {
|
export interface VoiceDesignForm {
|
||||||
@@ -29,6 +30,7 @@ export interface VoiceDesignForm {
|
|||||||
top_k?: number
|
top_k?: number
|
||||||
top_p?: number
|
top_p?: number
|
||||||
repetition_penalty?: number
|
repetition_penalty?: number
|
||||||
|
backend?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface VoiceCloneForm {
|
export interface VoiceCloneForm {
|
||||||
@@ -43,4 +45,5 @@ export interface VoiceCloneForm {
|
|||||||
top_k?: number
|
top_k?: number
|
||||||
top_p?: number
|
top_p?: number
|
||||||
repetition_penalty?: number
|
repetition_penalty?: number
|
||||||
|
backend?: string
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user