feat: update emotion handling in audiobook segments and UI for multi-emotion selection

2026-03-13 15:14:49 +08:00
parent 161e7fa76d
commit bf1532200a
4 changed files with 131 additions and 60 deletions
--- a/qwen3-tts-backend/core/audiobook_service.py
+++ b/qwen3-tts-backend/core/audiobook_service.py
@@ -16,7 +16,50 @@ from db.models import AudiobookProject, AudiobookCharacter, User
 logger = logging.getLogger(__name__)

 _LINE_RE = re.compile(r'^【(.+?)】(.*)$')
-_EMO_RE = re.compile(r'（([^:）]+):([0-9.]+)）\s*$')
+_EMO_RE = re.compile(r'（([^）]+)）\s*$')
+
+
+def _parse_emo(raw: str) -> tuple[Optional[str], Optional[float]]:
+    """Split the text of a trailing （…） tag into ``(emo_text, emo_alpha)``.
+
+    Three shapes are recognised:
+
+    * ``name:w`` -> ``(name, w)``
+    * ``a+b:w`` -> ``("a+b", w)``: one weight shared by the listed names
+    * ``a:w1+b:w2`` -> ``(raw, 1.0)``: per-emotion weights stay in the text
+
+    Anything else, including a tag whose last item has no weight, gives
+    ``(None, None)``.
+
+    Every weight must be a finite, non-negative number.  ``float()`` on its
+    own also accepts ``nan``, ``inf`` and negative values, none of which the
+    earlier ``[0-9.]+`` tag pattern could match.
+    """
+
+    def _weight(text: str) -> float:
+        # Raises ValueError for junk; the chained comparison is False for NaN.
+        value = float(text)
+        if not 0.0 <= value < float('inf'):
+            raise ValueError(f'emotion weight out of range: {text!r}')
+        return value
+
+    tokens = [t.strip() for t in raw.split('+') if t.strip()]
+    if not tokens or ':' not in tokens[-1]:
+        # Every accepted shape ends in a weighted item.
+        return None, None
+    try:
+        if len(tokens) > 1 and all(':' in t for t in tokens):
+            for tok in tokens:
+                _weight(tok.rpartition(':')[2])
+            return raw, 1.0
+        if len(tokens) == 1:
+            name, _, alpha = tokens[0].partition(':')
+            return name.strip(), _weight(alpha)
+        last_name, _, alpha = tokens[-1].rpartition(':')
+        names = tokens[:-1] + [last_name]
+        return '+'.join(n.strip() for n in names), _weight(alpha)
+    except ValueError:
+        return None, None

 # Cancellation events for batch operations, keyed by project_id
 _cancel_events: dict[int, asyncio.Event] = {}
@@ -196,12 +220,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
        emo_alpha = None
        emo_m = _EMO_RE.search(content)
        if emo_m:
-            emo_text = emo_m.group(1)
-            try:
-                emo_alpha = float(emo_m.group(2))
-            except ValueError:
-                emo_alpha = None
-            content = content[:emo_m.start()].strip()
+            et, ea = _parse_emo(emo_m.group(1))
+            if et is not None:
+                emo_text, emo_alpha = et, ea
+                content = content[:emo_m.start()].strip()

        if content.startswith('"') and content.endswith('"'):
            content = content[1:-1].strip()
@@ -211,12 +233,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
        if emo_text is None:
            emo_m = _EMO_RE.search(content)
            if emo_m:
-                emo_text = emo_m.group(1)
-                try:
-                    emo_alpha = float(emo_m.group(2))
-                except ValueError:
-                    emo_alpha = None
-                content = content[:emo_m.start()].strip()
+                et, ea = _parse_emo(emo_m.group(1))
+                if et is not None:
+                    emo_text, emo_alpha = et, ea
+                    content = content[:emo_m.start()].strip()

        character = speaker

--- a/qwen3-tts-backend/core/llm_service.py
+++ b/qwen3-tts-backend/core/llm_service.py
@@ -378,10 +378,11 @@ class LLMService:
            "  【角色名】\"对话内容\"（情感词:强度）\n\n"
            "情感标注规则：\n"
            "- 情感词可选：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶\n"
-            "- 可用 + 拼接多个情感词表达复杂情绪，如（开心+悲伤:0.4）、（愤怒+恐惧:0.5）\n"
-            "- 多情感时强度为混合情感的整体强度，每种情感对合成结果均有贡献\n"
-            f"- 各情感强度上限（严格不超过）：{limits_str}\n"
-            "- 情感不明显时可省略（情感词:强度）整个括号\n"
+            "- 单一情感：（情感词:强度），如（开心:0.5）、（悲伤:0.3）\n"
+            "- 混合情感：（情感1:比重+情感2:比重），如（开心:0.6+悲伤:0.2）、（愤怒:0.3+恐惧:0.4）\n"
+            "- 混合情感时每个情感的比重独立设定，反映各自对情绪的贡献\n"
+            f"- 各情感比重上限（严格不超过）：{limits_str}\n"
+            "- 情感不明显时可省略整个括号\n"
            + narrator_rule
            + emo_guidance_line
            + "\n其他规则：\n"
@@ -453,16 +454,18 @@ class LLMService:
            "你是一个专业的有声书制作助手。请将给定的章节文本解析为对话片段列表。"
            f"已知角色列表（必须从中选择）：{names_str}。"
            "所有非对话的叙述文字归属于旁白角色。\n"
-            "同时根据语境为每个片段判断是否有明显情绪，有则设置情绪类型（emo_text）和强度（emo_alpha），无则留空。\n"
-            "可选情绪：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
-            "- emo_text 可用 + 拼接多个情感词（如 \"开心+悲伤\"），表达复杂混合情绪\n"
-            "情绪不明显或旁白时，emo_text设为\"\"，emo_alpha设为0。\n"
-            "各情绪强度上限（严格不超过）：开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
+            "同时根据语境为每个片段判断是否有明显情绪，有则在 emo_text 中标注，无则留空。\n"
+            "可选情绪词：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
+            "emo_text 格式规则：\n"
+            "  单一情感：直接填情感词，用 emo_alpha 设置强度，如 emo_text=\"开心\", emo_alpha=0.3\n"
+            "  混合情感：用 情感词:比重 格式拼接，emo_alpha 设为 1.0，如 emo_text=\"开心:0.6+悲伤:0.2\", emo_alpha=1.0\n"
+            "情绪不明显或旁白时，emo_text=\"\"，emo_alpha=0。\n"
+            "各情感比重上限（严格不超过）：开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
            "同一角色的连续台词，情绪应尽量保持一致或仅有微弱变化，避免相邻片段间情绪跳跃。\n"
            "只输出JSON数组，不要有其他文字，格式如下：\n"
            '[{"character": "旁白", "text": "叙述文字", "emo_text": "", "emo_alpha": 0}, '
            '{"character": "角色名", "text": "对话内容", "emo_text": "开心", "emo_alpha": 0.3}, '
-            '{"character": "角色名", "text": "带泪的笑", "emo_text": "开心+悲伤", "emo_alpha": 0.4}]'
+            '{"character": "角色名", "text": "含泪的笑", "emo_text": "开心:0.5+悲伤:0.2", "emo_alpha": 1.0}]'
        )
        user_message = f"请解析以下章节文本：\n\n{chapter_text}"
        result = await self.stream_chat_json(system_prompt, user_message, on_token, max_tokens=16384, usage_callback=usage_callback)
--- a/qwen3-tts-backend/core/tts_service.py
+++ b/qwen3-tts-backend/core/tts_service.py
@@ -445,19 +445,48 @@ class IndexTTS2Backend:

    @staticmethod
    def _emo_text_to_vector(emo_text: str) -> Optional[list]:
-        text = emo_text.lower()
+        """Turn an emotion description into an 8-slot weight vector.
+
+        ``emo_text`` is split on ``+`` and each piece may carry its own
+        weight as ``name:weight``.  A piece selects every ``_EMO_KEYWORDS``
+        group with a keyword occurring in its lower-cased name; ``None`` is
+        returned when no piece selects anything.
+
+        A selected slot takes its piece's weight when that is a finite,
+        non-negative number.  Without one it falls back to 0.2 for slot 1,
+        otherwise 0.8 for a lone match and 0.5 when there are several.
+        """
+        tokens = [t.strip() for t in emo_text.split('+') if t.strip()]
        matched = []
-        for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS):
-            for word in words:
-                if word in text:
-                    matched.append(idx)
-                    break
+        for tok in tokens:
+            name_part = tok
+            weight: Optional[float] = None
+            if ':' in tok:
+                name_part, w_str = tok.rsplit(':', 1)
+                try:
+                    parsed = float(w_str)
+                except ValueError:
+                    parsed = None
+                # float() also accepts nan, inf and negative numbers; treat
+                # those as "no weight".  The comparison is False for NaN.
+                if parsed is not None and 0.0 <= parsed < float('inf'):
+                    weight = parsed
+            name_lower = name_part.lower().strip()
+            for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS):
+                for word in words:
+                    if word in name_lower:
+                        matched.append((idx, weight))
+                        break
        if not matched:
            return None
        vec = [0.0] * 8
-        score = 0.8 if len(matched) == 1 else 0.5
-        for idx in matched:
-            vec[idx] = 0.2 if idx == 1 else score
+        fallback = 0.8 if len(matched) == 1 else 0.5
+        for idx, w in matched:
+            if w is not None:
+                vec[idx] = w
+            else:
+                # Slot 1 keeps its lower default even beside weighted pieces.
+                vec[idx] = 0.2 if idx == 1 else fallback
        return vec

    async def generate(