From bf1532200a5eab3ded00510ec885bfa4ca348c19 Mon Sep 17 00:00:00 2001
From: bdim404 <i@bdim.moe>
Date: Fri, 13 Mar 2026 15:14:49 +0800
Subject: [PATCH] feat: update emotion handling in audiobook segments and UI
 for multi-emotion selection

---
 qwen3-tts-backend/core/audiobook_service.py | 46 ++++++++---
 qwen3-tts-backend/core/llm_service.py       | 23 +++---
 qwen3-tts-backend/core/tts_service.py       | 34 +++++---
 qwen3-tts-frontend/src/pages/Audiobook.tsx  | 88 ++++++++++++++-------
 4 files changed, 131 insertions(+), 60 deletions(-)

diff --git a/qwen3-tts-backend/core/audiobook_service.py b/qwen3-tts-backend/core/audiobook_service.py
index abd0268..51d3f00 100644
--- a/qwen3-tts-backend/core/audiobook_service.py
+++ b/qwen3-tts-backend/core/audiobook_service.py
@@ -16,7 +16,57 @@ from db.models import AudiobookProject, AudiobookCharacter, User
 logger = logging.getLogger(__name__)
 
 _LINE_RE = re.compile(r'^【(.+?)】(.*)$')
-_EMO_RE = re.compile(r'（([^:）]+):([0-9.]+)）\s*$')
+_EMO_RE = re.compile(r'（([^）]+)）\s*$')
+
+
+def _emo_weight(text: str) -> Optional[float]:
+    """Return *text* as a finite, non-negative float, or ``None`` otherwise."""
+    try:
+        value = float(text)
+    except ValueError:
+        return None
+    # float() also accepts 'nan' and 'inf'; neither is a usable intensity.
+    return value if 0.0 <= value < float('inf') else None
+
+
+def _parse_emo(raw: str) -> tuple[Optional[str], Optional[float]]:
+    """Parse the text inside a trailing ``（…）`` tag into ``(emo_text, emo_alpha)``.
+
+    Accepted forms (``+`` separates emotions, ``:`` introduces a number):
+
+    * ``name:a`` -> ``("name", a)``
+    * ``n1+n2:a`` -> ``("n1+n2", a)``: one intensity shared by all names
+    * ``n1:w1+n2:w2`` -> ``("n1:w1+n2:w2", 1.0)``: the per-emotion weights stay
+      in the text and the overall alpha is fixed at 1.0
+
+    Returns ``(None, None)`` for anything else -- no trailing number, an empty
+    name, or a number that is not finite and non-negative -- so the caller can
+    leave ordinary parenthesised prose in the segment text.
+    """
+    tokens = [t.strip() for t in raw.split('+') if t.strip()]
+    # Every accepted form ends in ``:number``.
+    if not tokens or ':' not in tokens[-1]:
+        return None, None
+
+    if len(tokens) > 1 and all(':' in t for t in tokens):
+        parts = []
+        for tok in tokens:
+            name, _, w = tok.rpartition(':')
+            name = name.strip()
+            if not name or _emo_weight(w) is None:
+                return None, None
+            parts.append(f'{name}:{w.strip()}')
+        return '+'.join(parts), 1.0
+
+    if len(tokens) == 1:
+        last_name, _, a = tokens[0].partition(':')
+    else:
+        last_name, _, a = tokens[-1].rpartition(':')
+    alpha = _emo_weight(a)
+    names = [n.strip() for n in tokens[:-1] + [last_name]]
+    if alpha is None or not all(names):
+        return None, None
+    return '+'.join(names), alpha
 
 # Cancellation events for batch operations, keyed by project_id
 _cancel_events: dict[int, asyncio.Event] = {}
@@ -196,12 +220,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
         emo_alpha = None
         emo_m = _EMO_RE.search(content)
         if emo_m:
-            emo_text = emo_m.group(1)
-            try:
-                emo_alpha = float(emo_m.group(2))
-            except ValueError:
-                emo_alpha = None
-            content = content[:emo_m.start()].strip()
+            et, ea = _parse_emo(emo_m.group(1))
+            if et is not None:
+                emo_text, emo_alpha = et, ea
+                content = content[:emo_m.start()].strip()
 
         if content.startswith('"') and content.endswith('"'):
             content = content[1:-1].strip()
@@ -211,12 +233,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
         if emo_text is None:
             emo_m = _EMO_RE.search(content)
             if emo_m:
-                emo_text = emo_m.group(1)
-                try:
-                    emo_alpha = float(emo_m.group(2))
-                except ValueError:
-                    emo_alpha = None
-                content = content[:emo_m.start()].strip()
+                et, ea = _parse_emo(emo_m.group(1))
+                if et is not None:
+                    emo_text, emo_alpha = et, ea
+                    content = content[:emo_m.start()].strip()
 
         character = speaker
 
diff --git a/qwen3-tts-backend/core/llm_service.py b/qwen3-tts-backend/core/llm_service.py
index fcf6ea5..0b7246d 100644
--- a/qwen3-tts-backend/core/llm_service.py
+++ b/qwen3-tts-backend/core/llm_service.py
@@ -378,10 +378,11 @@ class LLMService:
             "  【角色名】\"对话内容\"（情感词:强度）\n\n"
             "情感标注规则：\n"
             "- 情感词可选：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶\n"
-            "- 可用 + 拼接多个情感词表达复杂情绪，如（开心+悲伤:0.4）、（愤怒+恐惧:0.5）\n"
-            "- 多情感时强度为混合情感的整体强度，每种情感对合成结果均有贡献\n"
-            f"- 各情感强度上限（严格不超过）：{limits_str}\n"
-            "- 情感不明显时可省略（情感词:强度）整个括号\n"
+            "- 单一情感：（情感词:强度），如（开心:0.5）、（悲伤:0.3）\n"
+            "- 混合情感：（情感1:比重+情感2:比重），如（开心:0.6+悲伤:0.2）、（愤怒:0.3+恐惧:0.4）\n"
+            "- 混合情感时每个情感的比重独立设定，反映各自对情绪的贡献\n"
+            f"- 各情感比重上限（严格不超过）：{limits_str}\n"
+            "- 情感不明显时可省略整个括号\n"
             + narrator_rule
             + emo_guidance_line
             + "\n其他规则：\n"
@@ -453,16 +454,18 @@ class LLMService:
             "你是一个专业的有声书制作助手。请将给定的章节文本解析为对话片段列表。"
             f"已知角色列表（必须从中选择）：{names_str}。"
             "所有非对话的叙述文字归属于旁白角色。\n"
-            "同时根据语境为每个片段判断是否有明显情绪，有则设置情绪类型（emo_text）和强度（emo_alpha），无则留空。\n"
-            "可选情绪：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
-            "- emo_text 可用 + 拼接多个情感词（如 \"开心+悲伤\"），表达复杂混合情绪\n"
-            "情绪不明显或旁白时，emo_text设为\"\"，emo_alpha设为0。\n"
-            "各情绪强度上限（严格不超过）：开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
+            "同时根据语境为每个片段判断是否有明显情绪，有则在 emo_text 中标注，无则留空。\n"
+            "可选情绪词：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
+            "emo_text 格式规则：\n"
+            "  单一情感：直接填情感词，用 emo_alpha 设置强度，如 emo_text=\"开心\", emo_alpha=0.3\n"
+            "  混合情感：用 情感词:比重 格式拼接，emo_alpha 设为 1.0，如 emo_text=\"开心:0.6+悲伤:0.2\", emo_alpha=1.0\n"
+            "情绪不明显或旁白时，emo_text=\"\"，emo_alpha=0。\n"
+            "各情感比重上限（严格不超过）：开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
             "同一角色的连续台词，情绪应尽量保持一致或仅有微弱变化，避免相邻片段间情绪跳跃。\n"
             "只输出JSON数组，不要有其他文字，格式如下：\n"
             '[{"character": "旁白", "text": "叙述文字", "emo_text": "", "emo_alpha": 0}, '
             '{"character": "角色名", "text": "对话内容", "emo_text": "开心", "emo_alpha": 0.3}, '
-            '{"character": "角色名", "text": "带泪的笑", "emo_text": "开心+悲伤", "emo_alpha": 0.4}]'
+            '{"character": "角色名", "text": "含泪的笑", "emo_text": "开心:0.5+悲伤:0.2", "emo_alpha": 1.0}]'
         )
         user_message = f"请解析以下章节文本：\n\n{chapter_text}"
         result = await self.stream_chat_json(system_prompt, user_message, on_token, max_tokens=16384, usage_callback=usage_callback)
diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py
index d4f96eb..cfb6b0c 100644
--- a/qwen3-tts-backend/core/tts_service.py
+++ b/qwen3-tts-backend/core/tts_service.py
@@ -445,19 +445,52 @@ class IndexTTS2Backend:
 
     @staticmethod
     def _emo_text_to_vector(emo_text: str) -> Optional[list]:
-        text = emo_text.lower()
+        """Convert an emotion description into an 8-slot emotion vector.
+
+        *emo_text* is a ``+``-separated list of emotion names, each optionally
+        followed by ``:weight`` (``"开心"``, ``"开心+悲伤"``, ``"开心:0.6+悲伤:0.2"``).
+        Every ``_EMO_KEYWORDS`` entry with a keyword occurring in the lower-cased
+        name contributes its index.
+
+        If any matched name carries a usable weight, weighted names get that
+        weight (capped at 1.0) and unweighted ones get 0.5. Otherwise the
+        defaults apply: 0.8 for a single match, 0.5 each for several, and always
+        0.2 for index 1.
+
+        Returns ``None`` when nothing matches.
+        """
+        tokens = [t.strip() for t in emo_text.split('+') if t.strip()]
         matched = []
-        for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS):
-            for word in words:
-                if word in text:
-                    matched.append(idx)
-                    break
+        for tok in tokens:
+            name_part = tok
+            weight: Optional[float] = None
+            if ':' in tok:
+                name_part, w_str = tok.rsplit(':', 1)
+                try:
+                    parsed: Optional[float] = float(w_str)
+                except ValueError:
+                    parsed = None
+                # float() also accepts 'nan', 'inf' and negatives; treat those
+                # like a missing weight instead of writing them into the vector.
+                if parsed is not None and 0.0 <= parsed < float('inf'):
+                    # NOTE(review): assumes vector components are meant to stay
+                    # within [0, 1] -- confirm against the IndexTTS2 model.
+                    weight = min(parsed, 1.0)
+            name_lower = name_part.lower().strip()
+            for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS):
+                if any(word in name_lower for word in words):
+                    matched.append((idx, weight))
         if not matched:
             return None
         vec = [0.0] * 8
-        score = 0.8 if len(matched) == 1 else 0.5
-        for idx in matched:
-            vec[idx] = 0.2 if idx == 1 else score
+        has_explicit = any(w is not None for _, w in matched)
+        if has_explicit:
+            for idx, w in matched:
+                vec[idx] = w if w is not None else 0.5
+        else:
+            score = 0.8 if len(matched) == 1 else 0.5
+            for idx, _ in matched:
+                vec[idx] = 0.2 if idx == 1 else score
         return vec
 
     async def generate(
diff --git a/qwen3-tts-frontend/src/pages/Audiobook.tsx b/qwen3-tts-frontend/src/pages/Audiobook.tsx
index d9936b6..7bd7696 100644
--- a/qwen3-tts-frontend/src/pages/Audiobook.tsx
+++ b/qwen3-tts-frontend/src/pages/Audiobook.tsx
@@ -1428,8 +1428,8 @@ function ChaptersPanel({
   const [expandedChapters, setExpandedChapters] = useState<Set<number>>(new Set())
   const [editingSegId, setEditingSegId] = useState<number | null>(null)
   const [editText, setEditText] = useState('')
-  const [editEmoText, setEditEmoText] = useState('')
-  const [editEmoAlpha, setEditEmoAlpha] = useState(0.5)
+  const [editEmoSelections, setEditEmoSelections] = useState<string[]>([])
+  const [editEmoWeights, setEditEmoWeights] = useState<Record<string, number>>({})
   const [savingSegId, setSavingSegId] = useState<number | null>(null)
   const [regeneratingSegs, setRegeneratingSegs] = useState<Set<number>>(new Set())
   const [audioVersions, setAudioVersions] = useState<Record<number, number>>({})
@@ -1463,8 +1463,29 @@ function ChaptersPanel({
   const startEdit = (seg: AudiobookSegment) => {
     setEditingSegId(seg.id)
     setEditText(seg.text)
-    setEditEmoText(seg.emo_text || '')
-    setEditEmoAlpha(seg.emo_alpha ?? 0.5)
+    // Rebuild the per-emotion editor state from the stored emo_text / emo_alpha.
+    const alpha = seg.emo_alpha ?? 0.5
+    const tokens = (seg.emo_text || '').split('+').map(tok => tok.trim()).filter(Boolean)
+    const selections: string[] = []
+    const weights: Record<string, number> = {}
+    for (const tok of tokens) {
+      const [rawName, rawWeight] = tok.split(':')
+      const emo = rawName.trim()
+      // Skip empty names and repeats: each selection is rendered with key={emo}.
+      if (!emo || selections.includes(emo)) continue
+      selections.push(emo)
+      if (tokens.length === 1) {
+        // A lone emotion takes its intensity from emo_alpha, not from an inline weight.
+        weights[emo] = alpha
+        continue
+      }
+      // Several emotions: use the inline weight; fall back to half of alpha when it is
+      // missing or unparsable, so NaN never reaches the slider or the saved emo_text.
+      const parsed = rawWeight === undefined ? NaN : parseFloat(rawWeight)
+      weights[emo] = Number.isFinite(parsed) ? parsed : parseFloat((0.5 * alpha).toFixed(2))
+    }
+    setEditEmoSelections(selections)
+    setEditEmoWeights(weights)
   }
 
   const cancelEdit = () => setEditingSegId(null)
@@ -1472,11 +1494,16 @@ function ChaptersPanel({
   const saveEdit = async (segId: number) => {
     setSavingSegId(segId)
     try {
-      await onUpdateSegment(segId, {
-        text: editText,
-        emo_text: editEmoText || null,
-        emo_alpha: editEmoText ? editEmoAlpha : null,
-      })
+      let emo_text: string | null = null
+      let emo_alpha: number | null = null
+      if (editEmoSelections.length === 1) {
+        emo_text = editEmoSelections[0]
+        emo_alpha = editEmoWeights[editEmoSelections[0]] ?? 0.5
+      } else if (editEmoSelections.length > 1) {
+        emo_text = editEmoSelections.map(e => `${e}:${(editEmoWeights[e] ?? 0.5).toFixed(2)}`).join('+')
+        emo_alpha = 1.0
+      }
+      await onUpdateSegment(segId, { text: editText, emo_text, emo_alpha })
       setEditingSegId(null)
     } finally {
       setSavingSegId(null)
@@ -1673,11 +1700,16 @@ function ChaptersPanel({
                             </Badge>
                             {!isEditing && seg.emo_text && (
                               <span className="text-[11px] text-muted-foreground shrink-0 flex items-center gap-0.5 flex-wrap">
-                                {seg.emo_text.split('+').map(e => (
-                                  <span key={e} className="bg-muted rounded px-1">{e.trim()}</span>
-                                ))}
-                                {seg.emo_alpha != null && (
-                                  <span className="opacity-60 ml-0.5">{seg.emo_alpha.toFixed(2)}</span>
+                                {seg.emo_text.split('+').map(tok => {
+                                  const [name, w] = tok.split(':')
+                                  return (
+                                    <span key={tok} className="bg-muted rounded px-1">
+                                      {name.trim()}{w && <span className="opacity-60">:{parseFloat(w).toFixed(2)}</span>}
+                                    </span>
+                                  )
+                                })}
+                                {seg.emo_alpha != null && seg.emo_alpha !== 1 && (
+                                  <span className="opacity-60 ml-0.5">×{seg.emo_alpha.toFixed(2)}</span>
                                 )}
                               </span>
                             )}
@@ -1722,19 +1754,19 @@ function ChaptersPanel({
                                   <div className="flex items-center gap-1 flex-wrap">
                                     <span className="text-xs text-muted-foreground shrink-0">{t('projectCard.segments.emotion')}:</span>
                                     {EMOTION_OPTIONS.map(emo => {
-                                      const selectedEmos = editEmoText.split('+').filter(Boolean)
-                                      const isSelected = selectedEmos.includes(emo)
+                                      const isSelected = editEmoSelections.includes(emo)
                                       return (
                                         <button
                                           key={emo}
                                           type="button"
                                           className={`px-2 py-0.5 rounded text-xs border transition-colors ${isSelected ? "bg-primary text-primary-foreground border-primary" : "bg-muted text-muted-foreground border-transparent"}`}
                                           onClick={() => {
-                                            const current = editEmoText.split('+').filter(Boolean)
-                                            const next = isSelected
-                                              ? current.filter(e => e !== emo)
-                                              : [...current, emo]
-                                            setEditEmoText(next.join('+'))
+                                            if (isSelected) {
+                                              setEditEmoSelections(prev => prev.filter(e => e !== emo))
+                                            } else {
+                                              setEditEmoSelections(prev => [...prev, emo])
+                                              setEditEmoWeights(prev => ({ ...prev, [emo]: prev[emo] ?? 0.5 }))
+                                            }
                                           }}
                                         >
                                           {emo}
@@ -1742,21 +1774,21 @@ function ChaptersPanel({
                                       )
                                     })}
                                   </div>
-                                  {editEmoText && (
-                                    <div className="flex items-center gap-1.5">
-                                      <span className="text-xs text-muted-foreground shrink-0">{t('projectCard.segments.intensity')}:</span>
+                                  {editEmoSelections.map(emo => (
+                                    <div key={emo} className="flex items-center gap-1.5">
+                                      <span className="text-xs text-muted-foreground w-8 shrink-0">{emo}:</span>
                                       <input
                                         type="range"
                                         min={0.05}
                                         max={0.9}
                                         step={0.05}
-                                        value={editEmoAlpha}
-                                        onChange={e => setEditEmoAlpha(Number(e.target.value))}
+                                        value={editEmoWeights[emo] ?? 0.5}
+                                        onChange={e => setEditEmoWeights(prev => ({ ...prev, [emo]: Number(e.target.value) }))}
                                         className="flex-1 h-1.5 accent-primary"
                                       />
-                                      <span className="text-xs text-muted-foreground w-8 text-right">{editEmoAlpha.toFixed(2)}</span>
+                                      <span className="text-xs text-muted-foreground w-8 text-right">{(editEmoWeights[emo] ?? 0.5).toFixed(2)}</span>
                                     </div>
-                                  )}
+                                  ))}
                                 </div>
                               </div>
                             ) : (