feat: Implement gender-specific TTS instructions, refactor async database session handling for character creation and preview generation, and add Aliyun voice design creation.

2026-03-11 15:58:14 +08:00
parent d3c6297a09
commit ffd3d6675d
2 changed files with 103 additions and 65 deletions
--- a/qwen3-tts-backend/core/audiobook_service.py
+++ b/qwen3-tts-backend/core/audiobook_service.py
@@ -36,6 +36,26 @@ def _get_llm_service(user: User) -> LLMService:
    return LLMService(base_url=user.llm_base_url, api_key=api_key, model=user.llm_model)
 def _get_gendered_instruct(gender: Optional[str], base_instruct: str) -> str:
    """Ensure the instruction sent to the TTS model has explicit gender cues if known."""
    if not gender or gender == "未知":
        return base_instruct
    # We want to force a clear gender bias at the start of the prompt
    prefix = ""
    if gender == "男":
        prefix = "男性声音，"
    elif gender == "女":
        prefix = "女性声音，"
    if prefix and prefix not in base_instruct:
        # Prepend prefix, but try to be smart if the first line starts with "音色信息："
        if base_instruct.startswith("音色信息："):
            return base_instruct.replace("音色信息：", f"音色信息：{prefix}", 1)
        return f"{prefix}{base_instruct}"
    return base_instruct
 def _extract_epub_chapters(file_path: str) -> list[str]:
    try:
        import ebooklib
@@ -166,6 +186,17 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
        samples = _sample_full_text(text)
        n = len(samples)
        # Ensure previews directory is clean for new analysis
        previews_dir = Path(settings.OUTPUT_DIR) / "audiobook" / str(project_id) / "previews"
        if previews_dir.exists():
            import shutil
            try:
                shutil.rmtree(previews_dir)
            except Exception as e:
                logger.warning(f"Failed to clear previews directory: {e}")
        previews_dir.mkdir(parents=True, exist_ok=True)
        mode_label = "极速并发" if turbo else "顺序"
        ps.append_line(key, f"\n[LLM] 模型：{user.llm_model}，共 {n} 个采样段（{mode_label}模式），正在分析角色...\n")
        ps.append_line(key, "")
@@ -173,9 +204,12 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
        def on_token(token: str) -> None:
            ps.append_token(key, token)
        completed_count = 0
        def on_sample(i: int, total: int) -> None:
-            if i < total - 1:
+            nonlocal completed_count
-                ps.append_line(key, f"\n[LLM] 采样段 {i + 1}/{total} 完成，继续分析...\n")
+            completed_count += 1
            if completed_count < total:
                ps.append_line(key, f"\n[LLM] 采样段 {completed_count}/{total} 完成，继续分析...\n")
            else:
                ps.append_line(key, f"\n[LLM] 全部 {total} 个采样段完成，正在合并角色列表...\n")
            ps.append_line(key, "")
@@ -201,53 +235,33 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
        crud.delete_audiobook_characters(db, project_id)
        backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun"
-        
+
-        async def _create_char_with_voice(char_data):
+        for char_data in characters_data:
            name = char_data.get("name", "narrator")
            instruct = char_data.get("instruct", "")
            description = char_data.get("description", "")
            gender = char_data.get("gender") or ("未知" if name == "narrator" else None)
            # Requires isolated DB queries since we're in an async concurrent block
            try:
-                # We need an async wrapper or a local db session for concurrent sync DB pushes
+                voice_design = crud.create_voice_design(
-                # Because core crud uses synchronous SQLalchemy, executing them in threadpool via asyncio.to_thread
+                    db=db,
-                import asyncio
+                    user_id=user.id,
-                
+                    name=f"[有声书] {project.title} - {name}",
-                def db_ops():
+                    instruct=instruct,
-                    from core.database import SessionLocal
+                    backend_type=backend_type,
-                    local_db = SessionLocal()
+                    preview_text=description[:100] if description else None,
-                    try:
+                )
-                        voice_design = crud.create_voice_design(
+                crud.create_audiobook_character(
-                            db=local_db,
+                    db=db,
-                            user_id=user.id,
+                    project_id=project_id,
-                            name=f"[有声书] {project.title} - {name}",
+                    name=name,
-                            instruct=instruct,
+                    gender=gender,
-                            backend_type=backend_type,
+                    description=description,
-                            preview_text=description[:100] if description else None,
+                    instruct=instruct,
-                        )
+                    voice_design_id=voice_design.id,
-
+                )
                        crud.create_audiobook_character(
                            db=local_db,
                            project_id=project_id,
                            name=name,
                            gender=gender,
                            description=description,
                            instruct=instruct,
                            voice_design_id=voice_design.id,
                        )
                    finally:
                        local_db.close()
                await asyncio.to_thread(db_ops)
            except Exception as e:
                logger.error(f"Failed to create char/voice for {name}: {e}")
        import asyncio
        batch_tasks = [_create_char_with_voice(cd) for cd in characters_data]
        if batch_tasks:
            await asyncio.gather(*batch_tasks)
        crud.update_audiobook_project_status(db, project_id, "characters_ready")
        ps.mark_done(key)
        logger.info(f"Project {project_id} character extraction complete: {len(characters_data)} characters")
@@ -259,25 +273,34 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
        user_id = user.id
        async def _generate_all_previews():
-            async_db = SessionLocal()
+            # Get character IDs first using a temporary session
            temp_db = SessionLocal()
            try:
-                db_user = crud.get_user_by_id(async_db, user_id)
+                characters = crud.list_audiobook_characters(temp_db, project_id)
-                characters = crud.list_audiobook_characters(async_db, project_id)
+                char_ids = [c.id for c in characters]
                # Use a semaphore to limit concurrent TTS requests
                sem = asyncio.Semaphore(3)
                async def _gen(char_id: int):
                    async with sem:
                        try:
                            await generate_character_preview(project_id, char_id, db_user, async_db)
                        except Exception as e:
                            logger.error(f"Background preview generation failed for char {char_id}: {e}")
                tasks = [_gen(c.id) for c in characters]
                if tasks:
                    await asyncio.gather(*tasks)
            finally:
-                async_db.close()
+                temp_db.close()
            if not char_ids:
                return
            # Use a semaphore to limit concurrent TTS requests
            sem = asyncio.Semaphore(3)
            async def _gen(char_id: int):
                async with sem:
                    # Each concurrent task MUST have its own dedicated session
                    local_db = SessionLocal()
                    try:
                        db_user = crud.get_user_by_id(local_db, user_id)
                        await generate_character_preview(project_id, char_id, db_user, local_db)
                    except Exception as e:
                        logger.error(f"Background preview generation failed for char {char_id}: {e}")
                    finally:
                        local_db.close()
            tasks = [_gen(cid) for cid in char_ids]
            await asyncio.gather(*tasks)
        asyncio.create_task(_generate_all_previews())
@@ -538,7 +561,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
                        audio_bytes, _ = await backend.generate_voice_design({
                            "text": seg.text,
                            "language": "zh",
-                            "instruct": design.instruct,
+                            "instruct": _get_gendered_instruct(char.gender, design.instruct),
                        })
                else:
                    if design.voice_cache_id:
@@ -563,7 +586,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
                            audio_bytes, _ = await backend.generate_voice_design({
                                "text": seg.text,
                                "language": "Auto",
-                                "instruct": design.instruct,
+                                "instruct": _get_gendered_instruct(char.gender, design.instruct),
                                "max_new_tokens": 2048,
                                "temperature": 0.3,
                                "top_k": 10,
@@ -574,7 +597,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
                        audio_bytes, _ = await backend.generate_voice_design({
                            "text": seg.text,
                            "language": "Auto",
-                            "instruct": design.instruct,
+                            "instruct": _get_gendered_instruct(char.gender, design.instruct),
                            "max_new_tokens": 2048,
                            "temperature": 0.3,
                            "top_k": 10,
@@ -789,7 +812,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
            from core.cache_manager import VoiceCacheManager
            from utils.audio import process_ref_audio
            import hashlib
-            
+
            ref_text = "你好，这是参考音频。"
            ref_audio_bytes, _ = await backend.generate_voice_design({
                "text": ref_text,
@@ -823,6 +846,20 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
            db.commit()
            logger.info(f"Bootstrapped local voice cache for preview: design_id={design.id}, cache_id={cache_id}")
        if backend_type == "aliyun" and not design.aliyun_voice_id:
            from core.tts_service import AliyunTTSBackend
            if isinstance(backend, AliyunTTSBackend):
                try:
                    voice_id = await backend._create_voice_design(
                        instruct=_get_gendered_instruct(char.gender, design.instruct),
                        preview_text=preview_text,
                    )
                    design.aliyun_voice_id = voice_id
                    db.commit()
                    logger.info(f"Bootstrapped aliyun voice_id for preview: design_id={design.id}, voice_id={voice_id}")
                except Exception as e:
                    logger.warning(f"Failed to bootstrap aliyun voice_id for preview, falling back to instruct: {e}")
        if backend_type == "aliyun":
            if design.aliyun_voice_id:
                audio_bytes, _ = await backend.generate_voice_design(
@@ -833,7 +870,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
                audio_bytes, _ = await backend.generate_voice_design({
                    "text": preview_text,
                    "language": "zh",
-                    "instruct": design.instruct,
+                    "instruct": _get_gendered_instruct(char.gender, design.instruct),
                })
        else:
            if design.voice_cache_id:
@@ -858,7 +895,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
                    audio_bytes, _ = await backend.generate_voice_design({
                        "text": preview_text,
                        "language": "Auto",
-                        "instruct": design.instruct,
+                        "instruct": _get_gendered_instruct(char.gender, design.instruct),
                        "max_new_tokens": 512,
                        "temperature": 0.3,
                        "top_k": 10,
--- a/qwen3-tts-backend/core/llm_service.py
+++ b/qwen3-tts-backend/core/llm_service.py
@@ -122,12 +122,13 @@ class LLMService:
            "gender字段必须明确标注性别，只能取以下三个值之一：\"男\"、\"女\"、\"未知\"。\n"
            "narrator的gender固定为\"未知\"。\n"
            "对每个角色，instruct字段必须是详细的声音导演说明，需覆盖以下六个维度，每个维度单独一句，用换行分隔：\n"
-            "1. 音色信息：嗓音质感、音域、音量、气息特征（如：青年男性中低音，音色干净略带沙哑，音量偏小但稳定，情绪激动时呼吸明显）\n"
+            "1. 音色信息：嗓音质感、音域、音量、气息特征（例如，如果是女性角色，此处必须以'女性声音'开头，如：'女性声音，清脆悦耳的高音，嗓音纤细干净，带有一点点少女感'；男性角色则以'男性声音'开头）\n"
            "2. 身份背景：角色身份、职业、出身、所处时代背景对声音的影响\n"
            "3. 年龄设定：具体年龄段及其在声音上的体现\n"
            "4. 外貌特征：体型、面容、精神状态等可影响声音感知的特征\n"
            "5. 性格特质：核心性格、情绪模式、表达习惯\n"
            "6. 叙事风格：语速节奏、停顿习惯、语气色彩、整体叙述感\n\n"
            "注意：instruct 的第一行（音色信息）必须与 gender 字段保持一致。如果 gender 为女，第一行绝对不能出现'男性'字样。\n\n"
            "只输出JSON，格式如下，不要有其他文字：\n"
            '{"characters": [{"name": "narrator", "gender": "未知", "description": "第三人称叙述者", "instruct": "音色信息：...\\n身份背景：...\\n年龄设定：...\\n外貌特征：...\\n性格特质：...\\n叙事风格：..."}, ...]}'
        )