diff --git a/qwen3-tts-backend/api/audiobook.py b/qwen3-tts-backend/api/audiobook.py index b3de797..996e7fb 100644 --- a/qwen3-tts-backend/api/audiobook.py +++ b/qwen3-tts-backend/api/audiobook.py @@ -49,6 +49,7 @@ def _project_to_detail(project, db: Session) -> AudiobookProjectDetail: id=c.id, project_id=c.project_id, name=c.name, + gender=c.gender, description=c.description, instruct=c.instruct, voice_design_id=c.voice_design_id, @@ -213,6 +214,47 @@ async def confirm_characters( return {"message": "Chapters identified", "project_id": project_id} +@router.get("/projects/{project_id}/characters/{char_id}/audio") +async def get_character_audio( + project_id: int, + char_id: int, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db), +): + project = crud.get_audiobook_project(db, project_id, current_user.id) + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + audio_path = Path(settings.OUTPUT_DIR) / "audiobook" / str(project_id) / "previews" / f"char_{char_id}.wav" + if not audio_path.exists(): + raise HTTPException(status_code=404, detail="Preview audio not generated yet") + + return FileResponse(audio_path, media_type="audio/wav") + + +@router.post("/projects/{project_id}/characters/{char_id}/regenerate-preview") +async def regenerate_character_preview_endpoint( + project_id: int, + char_id: int, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db), +): + project = crud.get_audiobook_project(db, project_id, current_user.id) + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + from core.audiobook_service import generate_character_preview + + try: + await generate_character_preview(project_id, char_id, current_user, db) + return {"message": "Preview generated successfully"} + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Failed to regenerate preview: {e}", 
exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + @router.get("/projects/{project_id}/chapters", response_model=list[AudiobookChapterResponse]) async def list_chapters( project_id: int, @@ -385,6 +427,7 @@ async def update_character( id=char.id, project_id=char.project_id, name=char.name, + gender=char.gender, description=char.description, instruct=char.instruct, voice_design_id=char.voice_design_id, diff --git a/qwen3-tts-backend/core/audiobook_service.py b/qwen3-tts-backend/core/audiobook_service.py index aaf2aae..1060a1c 100644 --- a/qwen3-tts-backend/core/audiobook_service.py +++ b/qwen3-tts-backend/core/audiobook_service.py @@ -201,36 +201,86 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool crud.delete_audiobook_characters(db, project_id) backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun" - - for char_data in characters_data: + + async def _create_char_with_voice(char_data): name = char_data.get("name", "narrator") instruct = char_data.get("instruct", "") description = char_data.get("description", "") gender = char_data.get("gender") or ("未知" if name == "narrator" else None) - voice_design = crud.create_voice_design( - db=db, - user_id=user.id, - name=f"[有声书] {project.title} - {name}", - instruct=instruct, - backend_type=backend_type, - preview_text=description[:100] if description else None, - ) + # Requires isolated DB queries since we're in an async concurrent block + try: + # We need an async wrapper or a local db session for concurrent sync DB pushes + # Because core crud uses synchronous SQLalchemy, executing them in threadpool via asyncio.to_thread + import asyncio + + def db_ops(): + from core.database import SessionLocal + local_db = SessionLocal() + try: + voice_design = crud.create_voice_design( + db=local_db, + user_id=user.id, + name=f"[有声书] {project.title} - {name}", + instruct=instruct, + backend_type=backend_type, + 
preview_text=description[:100] if description else None, + ) - crud.create_audiobook_character( - db=db, - project_id=project_id, - name=name, - gender=gender, - description=description, - instruct=instruct, - voice_design_id=voice_design.id, - ) + crud.create_audiobook_character( + db=local_db, + project_id=project_id, + name=name, + gender=gender, + description=description, + instruct=instruct, + voice_design_id=voice_design.id, + ) + finally: + local_db.close() + + await asyncio.to_thread(db_ops) + except Exception as e: + logger.error(f"Failed to create char/voice for {name}: {e}") + + import asyncio + batch_tasks = [_create_char_with_voice(cd) for cd in characters_data] + if batch_tasks: + await asyncio.gather(*batch_tasks) crud.update_audiobook_project_status(db, project_id, "characters_ready") ps.mark_done(key) logger.info(f"Project {project_id} character extraction complete: {len(characters_data)} characters") + # Kick off background preview generation + import asyncio + from core.database import SessionLocal + + user_id = user.id + + async def _generate_all_previews(): + async_db = SessionLocal() + try: + db_user = crud.get_user_by_id(async_db, user_id) + characters = crud.list_audiobook_characters(async_db, project_id) + + # Use a semaphore to limit concurrent TTS requests + sem = asyncio.Semaphore(3) + async def _gen(char_id: int): + async with sem: + try: + await generate_character_preview(project_id, char_id, db_user, async_db) + except Exception as e: + logger.error(f"Background preview generation failed for char {char_id}: {e}") + + tasks = [_gen(c.id) for c in characters] + if tasks: + await asyncio.gather(*tasks) + finally: + async_db.close() + + asyncio.create_task(_generate_all_previews()) + except Exception as e: logger.error(f"Analysis failed for project {project_id}: {e}", exc_info=True) ps.append_line(key, f"\n[错误] {e}") @@ -587,6 +637,10 @@ async def parse_all_chapters(project_id: int, user: User, db: Session, statuses: max_concurrent = 
settings.AUDIOBOOK_PARSE_CONCURRENCY semaphore = asyncio.Semaphore(max_concurrent) logger.info(f"parse_all_chapters: project={project_id}, {len(pending)} chapters, concurrency={max_concurrent}") + + ps = ProgressStore() + key = f"project_{project_id}" + ps.append_line(key, f"\n[状态] 开启章节并发解析,共 {len(pending)} 章待处理,最大并发: {max_concurrent}...\n") async def parse_with_limit(chapter): if cancel_ev.is_set(): @@ -605,6 +659,12 @@ async def parse_all_chapters(project_id: int, user: User, db: Session, statuses: await asyncio.gather(*[parse_with_limit(ch) for ch in pending]) _cancel_events.pop(project_id, None) + + if cancel_ev.is_set(): + ps.append_line(key, f"\n[状态] 章节批量解析被用户取消\n") + else: + ps.append_line(key, f"\n[状态] 所有章节批量解析已完成\n") + logger.info(f"parse_all_chapters: project={project_id} {'cancelled' if cancel_ev.is_set() else 'complete'}") @@ -677,3 +737,150 @@ async def process_all(project_id: int, user: User, db: Session) -> None: logger.info(f"process_all: project={project_id} complete") + +async def generate_character_preview(project_id: int, char_id: int, user: User, db: Session) -> None: + """Generate a short audio preview for a specific character.""" + project = crud.get_audiobook_project(db, project_id, user.id) + if not project: + raise ValueError("Project not found") + + char = crud.get_audiobook_character(db, char_id) + if not char or char.project_id != project_id: + raise ValueError("Character not found or doesn't belong to this project") + + if not char.voice_design_id: + raise ValueError("Character has no associated voice design") + + design = crud.get_voice_design(db, char.voice_design_id, user.id) + if not design: + raise ValueError("Voice design not found") + + output_base = Path(settings.OUTPUT_DIR) / "audiobook" / str(project_id) / "previews" + output_base.mkdir(parents=True, exist_ok=True) + audio_path = output_base / f"char_{char_id}.wav" + + preview_name = char.name + if preview_name == "narrator": + preview_name = "旁白" + + preview_desc = "" + if 
char.description: + # Take a short snippet of description to make it sound natural + preview_desc = "," + char.description[:30].replace('\n', ',') + if not preview_desc.endswith('。') and not preview_desc.endswith('!'): + preview_desc += "。" + + preview_text = f"你好,我是{preview_name}{preview_desc}" + + from core.tts_service import TTSServiceFactory + from core.security import decrypt_api_key + + backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun" + user_api_key = None + if backend_type == "aliyun" and user.aliyun_api_key: + user_api_key = decrypt_api_key(user.aliyun_api_key) + + backend = await TTSServiceFactory.get_backend(backend_type, user_api_key) + + try: + if backend_type == "local" and not design.voice_cache_id: + logger.info(f"Local voice cache missing for char {char_id}. Bootstrapping now...") + from core.model_manager import ModelManager + from core.cache_manager import VoiceCacheManager + from utils.audio import process_ref_audio + import hashlib + + ref_text = "你好,这是参考音频。" + ref_audio_bytes, _ = await backend.generate_voice_design({ + "text": ref_text, + "language": "Auto", + "instruct": design.instruct or "", + "max_new_tokens": 512, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }) + + model_manager = await ModelManager.get_instance() + await model_manager.load_model("base") + _, tts = await model_manager.get_current_model() + + ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes) + x_vector = tts.create_voice_clone_prompt( + ref_audio=(ref_audio_array, ref_sr), + ref_text=ref_text, + ) + + cache_manager = await VoiceCacheManager.get_instance() + ref_audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest() + cache_id = await cache_manager.set_cache( + user.id, ref_audio_hash, x_vector, + {"ref_text": ref_text, "instruct": design.instruct}, + db + ) + design.voice_cache_id = cache_id + db.commit() + logger.info(f"Bootstrapped local voice cache for 
preview: design_id={design.id}, cache_id={cache_id}") + + if backend_type == "aliyun": + if design.aliyun_voice_id: + audio_bytes, _ = await backend.generate_voice_design( + {"text": preview_text, "language": "zh"}, + saved_voice_id=design.aliyun_voice_id + ) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": preview_text, + "language": "zh", + "instruct": design.instruct, + }) + else: + if design.voice_cache_id: + from core.cache_manager import VoiceCacheManager + cache_manager = await VoiceCacheManager.get_instance() + cache_result = await cache_manager.get_cache_by_id(design.voice_cache_id, db) + x_vector = cache_result['data'] if cache_result else None + if x_vector: + audio_bytes, _ = await backend.generate_voice_clone( + { + "text": preview_text, + "language": "Auto", + "max_new_tokens": 512, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }, + x_vector=x_vector + ) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": preview_text, + "language": "Auto", + "instruct": design.instruct, + "max_new_tokens": 512, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": preview_text, + "language": "Auto", + "instruct": design.instruct, + "max_new_tokens": 512, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }) + + with open(audio_path, "wb") as f: + f.write(audio_bytes) + + logger.info(f"Preview generated for char {char_id}: {audio_path}") + except Exception as e: + logger.error(f"Failed to generate preview for char {char_id}: {e}") + raise diff --git a/qwen3-tts-backend/core/llm_service.py b/qwen3-tts-backend/core/llm_service.py index 08039fb..3f0ee79 100644 --- a/qwen3-tts-backend/core/llm_service.py +++ b/qwen3-tts-backend/core/llm_service.py @@ -134,13 +134,15 @@ class LLMService: if turbo and len(text_samples) > 1: 
logger.info(f"Extracting characters in turbo mode: {len(text_samples)} samples concurrent") - async def _extract_one(sample: str) -> list[Dict]: + async def _extract_one(i: int, sample: str) -> list[Dict]: user_message = f"请分析以下小说文本并提取角色:\n\n{sample}" result = await self.stream_chat_json(system_prompt, user_message, None) + if on_sample: + on_sample(i, len(text_samples)) return result.get("characters", []) results = await asyncio.gather( - *[_extract_one(s) for s in text_samples], + *[_extract_one(i, s) for i, s in enumerate(text_samples)], return_exceptions=True, ) raw_all: list[Dict] = [] diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py index c9ed68d..a61dff2 100644 --- a/qwen3-tts-backend/core/tts_service.py +++ b/qwen3-tts-backend/core/tts_service.py @@ -32,6 +32,8 @@ class TTSBackend(ABC): class LocalTTSBackend(TTSBackend): def __init__(self): self.model_manager = None + # Add a lock to prevent concurrent VRAM contention and CUDA errors on local GPU models + self._gpu_lock = asyncio.Lock() async def initialize(self): from core.model_manager import ModelManager @@ -42,21 +44,22 @@ class LocalTTSBackend(TTSBackend): _, tts = await self.model_manager.get_current_model() loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, - functools.partial( - tts.generate_custom_voice, - text=params['text'], - language=params['language'], - speaker=params['speaker'], - instruct=params.get('instruct', ''), - max_new_tokens=params['max_new_tokens'], - temperature=params['temperature'], - top_k=params['top_k'], - top_p=params['top_p'], - repetition_penalty=params['repetition_penalty'], + async with self._gpu_lock: + result = await loop.run_in_executor( + None, + functools.partial( + tts.generate_custom_voice, + text=params['text'], + language=params['language'], + speaker=params['speaker'], + instruct=params.get('instruct', ''), + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + 
top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'], + ) ) - ) import numpy as np wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000) @@ -68,20 +71,21 @@ class LocalTTSBackend(TTSBackend): _, tts = await self.model_manager.get_current_model() loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, - functools.partial( - tts.generate_voice_design, - text=params['text'], - language=params['language'], - instruct=params['instruct'], - max_new_tokens=params['max_new_tokens'], - temperature=params['temperature'], - top_k=params['top_k'], - top_p=params['top_p'], - repetition_penalty=params['repetition_penalty'], + async with self._gpu_lock: + result = await loop.run_in_executor( + None, + functools.partial( + tts.generate_voice_design, + text=params['text'], + language=params['language'], + instruct=params['instruct'], + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'], + ) ) - ) import numpy as np wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000) @@ -96,37 +100,38 @@ class LocalTTSBackend(TTSBackend): loop = asyncio.get_event_loop() - if x_vector is None: - if ref_audio_bytes is None: - raise ValueError("Either ref_audio_bytes or x_vector must be provided") + async with self._gpu_lock: + if x_vector is None: + if ref_audio_bytes is None: + raise ValueError("Either ref_audio_bytes or x_vector must be provided") - ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes) + ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes) - x_vector = await loop.run_in_executor( + x_vector = await loop.run_in_executor( + None, + functools.partial( + tts.create_voice_clone_prompt, + ref_audio=(ref_audio_array, ref_sr), + ref_text=params.get('ref_text', ''), + x_vector_only_mode=False, + ) + ) + + wavs, sample_rate = await 
loop.run_in_executor( None, functools.partial( - tts.create_voice_clone_prompt, - ref_audio=(ref_audio_array, ref_sr), - ref_text=params.get('ref_text', ''), - x_vector_only_mode=False, + tts.generate_voice_clone, + text=params['text'], + language=params['language'], + voice_clone_prompt=x_vector, + max_new_tokens=params['max_new_tokens'], + temperature=params['temperature'], + top_k=params['top_k'], + top_p=params['top_p'], + repetition_penalty=params['repetition_penalty'], ) ) - wavs, sample_rate = await loop.run_in_executor( - None, - functools.partial( - tts.generate_voice_clone, - text=params['text'], - language=params['language'], - voice_clone_prompt=x_vector, - max_new_tokens=params['max_new_tokens'], - temperature=params['temperature'], - top_k=params['top_k'], - top_p=params['top_p'], - repetition_penalty=params['repetition_penalty'], - ) - ) - import numpy as np audio_data = wavs[0] if isinstance(wavs, list) else wavs if isinstance(audio_data, list): diff --git a/qwen3-tts-frontend/src/lib/api/audiobook.ts b/qwen3-tts-frontend/src/lib/api/audiobook.ts index d1ba729..967d3d5 100644 --- a/qwen3-tts-frontend/src/lib/api/audiobook.ts +++ b/qwen3-tts-frontend/src/lib/api/audiobook.ts @@ -139,6 +139,14 @@ export const audiobookApi = { return `/audiobook/projects/${projectId}/segments/${segmentId}/audio` }, + getCharacterAudioUrl: (projectId: number, charId: number): string => { + return `/audiobook/projects/${projectId}/characters/${charId}/audio` + }, + + regenerateCharacterPreview: async (projectId: number, charId: number): Promise => { + await apiClient.post(`/audiobook/projects/${projectId}/characters/${charId}/regenerate-preview`) + }, + parseAllChapters: async (projectId: number, onlyErrors?: boolean): Promise => { const params = onlyErrors ? 
'?only_errors=true' : '' await apiClient.post(`/audiobook/projects/${projectId}/parse-all${params}`) diff --git a/qwen3-tts-frontend/src/locales/en-US/audiobook.json b/qwen3-tts-frontend/src/locales/en-US/audiobook.json index 123fde9..c1a3622 100644 --- a/qwen3-tts-frontend/src/locales/en-US/audiobook.json +++ b/qwen3-tts-frontend/src/locales/en-US/audiobook.json @@ -15,7 +15,8 @@ "processing": "Processing", "generating": "Generating", "done": "Done", - "error": "Error" + "error": "Error", + "turboActive": "⚡ Turbo" }, "stepHints": { diff --git a/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json b/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json index ca39328..c429218 100644 --- a/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json +++ b/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json @@ -15,7 +15,8 @@ "processing": "处理中", "generating": "生成中", "done": "已完成", - "error": "出错" + "error": "出错", + "turboActive": "⚡ 极速并发" }, "stepHints": { @@ -83,7 +84,10 @@ "descPlaceholder": "角色描述", "voiceDesign": "音色 #{{id}}", "noVoice": "未分配", - "savedSuccess": "角色已保存" + "savedSuccess": "角色已保存", + "regeneratingPreview": "重新生成试听中...", + "regeneratePreview": "重生试听", + "previewNotReady": "试听收集中..." 
}, "confirm": { diff --git a/qwen3-tts-frontend/src/pages/Audiobook.tsx b/qwen3-tts-frontend/src/pages/Audiobook.tsx index 36c43dc..ef2de2a 100644 --- a/qwen3-tts-frontend/src/pages/Audiobook.tsx +++ b/qwen3-tts-frontend/src/pages/Audiobook.tsx @@ -343,6 +343,8 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr const [charsCollapsed, setCharsCollapsed] = useState(false) const [chaptersCollapsed, setChaptersCollapsed] = useState(false) const [expandedChapters, setExpandedChapters] = useState>(new Set()) + const [voiceKeys, setVoiceKeys] = useState>({}) + const [regeneratingVoices, setRegeneratingVoices] = useState>(new Set()) const prevStatusRef = useRef(project.status) const autoExpandedRef = useRef(new Set()) @@ -433,7 +435,7 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr setLoadingAction(true) setIsPolling(true) try { - await audiobookApi.analyze(project.id, {}) + await audiobookApi.analyze(project.id, { turbo: true }) toast.success(t('projectCard.analyzeStarted')) onRefresh() } catch (e: any) { @@ -525,6 +527,24 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr } } + const handleRegeneratePreview = async (charId: number) => { + if (!project) return + setRegeneratingVoices(prev => new Set(prev).add(charId)) + try { + await audiobookApi.regenerateCharacterPreview(project.id, charId) + toast.success(t('projectCard.characters.savedSuccess')) // or add a new toast key + setVoiceKeys(prev => ({ ...prev, [charId]: (prev[charId] || 0) + 1 })) + } catch (e: any) { + toast.error(formatApiError(e)) + } finally { + setRegeneratingVoices(prev => { + const next = new Set(prev) + next.delete(charId) + return next + }) + } + } + const handleGenerateAll = async () => { if (!detail) return setLoadingAction(true) @@ -667,6 +687,8 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr return status })() + const isTurboMode = ['analyzing', 'parsing', 
'processing'].includes(displayStatus) + return (
@@ -675,6 +697,11 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr {project.title}
+ {isTurboMode && ( + + {t('status.turboActive')} + + )} {t(`status.${displayStatus}`, { defaultValue: displayStatus })} @@ -863,6 +890,34 @@ function ProjectCard({ project, onRefresh }: { project: AudiobookProject; onRefr
                   )}
+
+                  {!editingCharId && char.voice_design_id && (
+                    <div className="mt-2 flex items-center gap-2">
+                      {/* key bump forces the audio element to reload after a regenerated preview */}
+                      <audio
+                        key={voiceKeys[char.id] || 0}
+                        controls
+                        preload="none"
+                        className="h-8 flex-1"
+                        src={audiobookApi.getCharacterAudioUrl(project.id, char.id)}
+                      >
+                        {t('projectCard.characters.previewNotReady')}
+                      </audio>
+
+                      {status === 'characters_ready' && (
+                        <button
+                          type="button"
+                          onClick={() => handleRegeneratePreview(char.id)}
+                          disabled={regeneratingVoices.has(char.id)}
+                          className="text-xs px-2 py-1 rounded border"
+                        >
+                          {regeneratingVoices.has(char.id)
+                            ? t('projectCard.characters.regeneratingPreview')
+                            : t('projectCard.characters.regeneratePreview')}
+                        </button>
+                      )}
+                    </div>
+                  )}
                 </div>
               ))}
             }