feat: update emotion handling and adjust alpha levels in TTS and LLM services

2026-04-07 14:17:29 +08:00
parent a8d6195cdb
commit a144540cbe
3 changed files with 74 additions and 49 deletions
--- a/qwen3-tts-backend/core/llm_service.py
+++ b/qwen3-tts-backend/core/llm_service.py
@@ -321,26 +321,17 @@ class LLMService:
    @staticmethod
    def _emotion_limits(violence_level: int, eroticism_level: int) -> tuple[str, str]:
        v = violence_level / 10
        e = eroticism_level / 10
        female_happy = round(0.20 + 0.45 * e, 2)
        angry   = round(0.15 + 0.65 * v, 2)
        sad     = round(0.10 + 0.40 * v, 2)
        fear    = round(0.10 + 0.60 * v, 2)
        hate    = round(0.35 + 0.25 * max(v, e), 2)
        low     = round(0.35 + 0.45 * e, 2)
        surprise= round(0.10 + 0.35 * max(v, e), 2)
        limits = (
            f"愤怒={angry}、悲伤={sad}、恐惧={fear}、厌恶={hate}、低沉={low}、惊讶={surprise}、"
            f"开心：男性角色上限=0.20，女性角色上限={female_happy}"
        )
        guidance_parts = []
-        if violence_level >= 4:
+        if violence_level >= 7:
-            guidance_parts.append(f"暴力程度{violence_level}/10，台词中的愤怒、恐惧、悲伤情绪必须强烈外露，不得克制")
+            guidance_parts.append(f"暴力程度{violence_level}/10，激烈场景的愤怒/恐惧/悲伤强度应用7-10级，不得克制")
-        if eroticism_level >= 4:
+        elif violence_level >= 4:
-            guidance_parts.append(f"色情程度{eroticism_level}/10，女性台词中的开心、低沉、挑逗情绪应充分表达")
+            guidance_parts.append(f"暴力程度{violence_level}/10，台词中的愤怒/恐惧/悲伤情绪可用4-7级")
        if eroticism_level >= 7:
            guidance_parts.append(f"色情程度{eroticism_level}/10，女性台词中的开心/低沉情绪应用7-10级充分表达")
        elif eroticism_level >= 4:
            guidance_parts.append(f"色情程度{eroticism_level}/10，女性台词中的开心/低沉情绪可用4-7级")
        guidance = "；".join(guidance_parts)
-        return limits, guidance
+        return "", guidance
    async def generate_chapter_script(
        self,
@@ -383,11 +374,9 @@ class LLMService:
            "  【角色名】\"对话内容\"（情感词:强度）\n\n"
            "情感标注规则：\n"
            "- 情感词可选：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶\n"
-            "- 单一情感：（情感词:强度），如（开心:0.5）、（悲伤:0.3）\n"
+            "- 每行只允许标注一个情感词，格式：（情感词:强度级别），强度为1–10的整数，10最强\n"
-            "- 混合情感：（情感1:比重+情感2:比重），如（开心:0.6+悲伤:0.2）、（愤怒:0.3+恐惧:0.4）\n"
+            "- 示例：（开心:6）、（悲伤:3）、（愤怒:8）\n"
-            "- 混合情感时每个情感的比重独立设定，反映各自对情绪的贡献\n"
+            "- 鼓励使用低值（1–3）表达微弱、内敛或一闪而过的情绪，无需非强即无\n"
            f"- 各情感比重上限（严格不超过）：{limits_str}\n"
            "- 鼓励使用低值（0.05–0.10）表达微弱、内敛或一闪而过的情绪，无需非强即无\n"
            "- 确实没有任何情绪色彩时可省略整个括号\n"
            + char_personality_str
            + narrator_rule
@@ -468,18 +457,15 @@ class LLMService:
            "所有非对话的叙述文字归属于旁白角色。\n"
            "同时根据语境为每个片段判断是否有明显情绪，有则在 emo_text 中标注，无则留空。\n"
            "可选情绪词：开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
-            "emo_text 格式规则：\n"
+            "emo_text 只允许单一情感词；emo_alpha 为1–10的整数表示强度（10最强）；完全无情绪色彩时 emo_text 置空，emo_alpha 为 0。\n"
-            "  单一情感：直接填情感词，用 emo_alpha 设置强度，如 emo_text=\"开心\", emo_alpha=0.3\n"
+            "鼓励用低值（1–3）表达微弱或内敛的情绪，不要非强即无。\n"
            "  混合情感：用 情感词:比重 格式拼接，emo_alpha 设为 1.0，如 emo_text=\"开心:0.6+悲伤:0.2\", emo_alpha=1.0\n"
            "各情感比重上限（严格不超过）：开心=0.20、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.10。\n"
            "鼓励用低值（0.05–0.10）表达微弱或内敛的情绪，不要非强即无；完全无情绪色彩时 emo_text 置空。\n"
            + personality_str
            + "同一角色的连续台词，情绪应尽量保持一致或仅有微弱变化，避免相邻片段间情绪跳跃。\n"
            "只输出JSON数组，不要有其他文字，格式如下：\n"
            '[{"character": "旁白", "text": "叙述文字", "emo_text": "", "emo_alpha": 0}, '
-            '{"character": "角色名", "text": "淡淡的问候", "emo_text": "开心", "emo_alpha": 0.08}, '
+            '{"character": "角色名", "text": "淡淡的问候", "emo_text": "开心", "emo_alpha": 3}, '
-            '{"character": "角色名", "text": "激动的欢呼", "emo_text": "开心", "emo_alpha": 0.18}, '
+            '{"character": "角色名", "text": "激动的欢呼", "emo_text": "开心", "emo_alpha": 8}, '
-            '{"character": "角色名", "text": "含泪的笑", "emo_text": "开心:0.12+悲伤:0.08", "emo_alpha": 1.0}]'
+            '{"character": "角色名", "text": "愤怒的质问", "emo_text": "愤怒", "emo_alpha": 7}]'
        )
        user_message = f"请解析以下章节文本：\n\n{chapter_text}"
        result = await self.stream_chat_json(system_prompt, user_message, on_token, max_tokens=16384, usage_callback=usage_callback)
--- a/qwen3-tts-backend/core/tts_service.py
+++ b/qwen3-tts-backend/core/tts_service.py
@@ -166,6 +166,17 @@ class LocalTTSBackend(TTSBackend):
 class IndexTTS2Backend:
    _gpu_lock = asyncio.Lock()
    # Level 10 = these raw weights. Scale linearly: level N → N/10 * max
    EMO_LEVEL_MAX: dict[str, float] = {
        "开心": 0.75, "happy": 0.75,
        "愤怒": 0.08, "angry": 0.08,
        "悲伤": 0.90, "sad": 0.90,
        "恐惧": 0.10, "fear": 0.10,
        "厌恶": 0.50, "hate": 0.50,
        "低沉": 0.35, "low": 0.35,
        "惊讶": 0.35, "surprise": 0.35,
    }
    # Emotion keyword → index mapping
    # Order: [happy, angry, sad, fear, hate, low, surprise, neutral]
    _EMO_KEYWORDS = [
@@ -176,7 +187,6 @@ class IndexTTS2Backend:
        ['厌恶', '厌', 'hate', '讨厌', '反感'],
        ['低落', '沮丧', '消沉', 'low', '抑郁', '颓废'],
        ['惊喜', '惊讶', '意外', 'surprise', '惊', '吃惊', '震惊'],
        ['自然', '平静', '中性', '平和', 'neutral', '平淡', '冷静', '稳定'],
    ]
    @staticmethod
@@ -227,10 +237,27 @@ class IndexTTS2Backend:
        emo_vector = None
        if emo_text and len(emo_text.strip()) > 0:
-            raw_vector = self._emo_text_to_vector(emo_text)
+            resolved_emo_text = emo_text
            resolved_emo_alpha = emo_alpha
            if emo_alpha is not None and emo_alpha > 1:
                level = min(10, max(1, round(emo_alpha)))
                name = emo_text.strip()
                max_val = self.EMO_LEVEL_MAX.get(name)
                if max_val is None:
                    name_lower = name.lower()
                    for key, val in self.EMO_LEVEL_MAX.items():
                        if key in name_lower or name_lower in key:
                            max_val = val
                            break
                if max_val is None:
                    max_val = 0.20
                weight = round(level / 10 * max_val, 4)
                resolved_emo_text = f"{name}:{weight}"
                resolved_emo_alpha = 1.0
            raw_vector = self._emo_text_to_vector(resolved_emo_text)
            if raw_vector is not None:
-                emo_vector = [v * emo_alpha for v in raw_vector]
+                emo_vector = [v * resolved_emo_alpha for v in raw_vector]
-            logger.info(f"IndexTTS2 emo_text={repr(emo_text)} emo_alpha={emo_alpha} → emo_vector={emo_vector}")
+            logger.info(f"IndexTTS2 emo_text={repr(emo_text)} emo_alpha={emo_alpha} → resolved={repr(resolved_emo_text)} emo_vector={emo_vector}")
        async with IndexTTS2Backend._gpu_lock:
            await loop.run_in_executor(
--- a/qwen3-tts-frontend/src/pages/Audiobook.tsx
+++ b/qwen3-tts-frontend/src/pages/Audiobook.tsx
@@ -1416,7 +1416,12 @@ function CharactersPanel({
  )
 }
-const EMOTION_OPTIONS = ['开心', '愤怒', '悲伤', '恐惧', '厌恶', '低沉', '惊讶', '中性']
+const EMOTION_OPTIONS = ['开心', '愤怒', '悲伤', '恐惧', '厌恶', '低沉', '惊讶']
 const EMO_LEVEL_MAX: Record<string, number> = {
  '开心': 0.75, '愤怒': 0.08, '悲伤': 0.90,
  '恐惧': 0.10, '厌恶': 0.50, '低沉': 0.35, '惊讶': 0.35,
 }
 function ChaptersPanel({
@@ -1517,7 +1522,7 @@ function ChaptersPanel({
    setEditingSegId(seg.id)
    setEditText(seg.text)
    const rawEmo = seg.emo_text || ''
-    const alpha = seg.emo_alpha ?? 0.5
+    const alpha = seg.emo_alpha ?? 5
    if (!rawEmo) {
      setEditEmoSelections([])
      setEditEmoWeights({})
@@ -1528,14 +1533,17 @@ function ChaptersPanel({
    const weights: Record<string, number> = {}
    if (tokens.length === 1) {
      const [name] = tokens[0].split(':')
-      selections.push(name.trim())
+      const emoName = name.trim()
-      weights[name.trim()] = alpha
+      selections.push(emoName)
      // Convert old float alpha to level if needed
      weights[emoName] = alpha > 1 ? Math.round(alpha) : Math.round(alpha / (EMO_LEVEL_MAX[emoName] || 0.35) * 10)
    } else {
      for (const tok of tokens) {
        const [name, w] = tok.split(':')
        const emo = name.trim()
        selections.push(emo)
-        weights[emo] = w ? parseFloat(w) : parseFloat((0.5 * alpha).toFixed(2))
+        const rawW = w ? parseFloat(w) : (EMO_LEVEL_MAX[emo] || 0.35) * 0.5
        weights[emo] = Math.round(rawW / (EMO_LEVEL_MAX[emo] || 0.35) * 10)
      }
    }
    setEditEmoSelections(selections)
@@ -1551,9 +1559,13 @@ function ChaptersPanel({
      let emo_alpha: number | null = null
      if (editEmoSelections.length === 1) {
        emo_text = editEmoSelections[0]
-        emo_alpha = editEmoWeights[editEmoSelections[0]] ?? 0.5
+        emo_alpha = editEmoWeights[editEmoSelections[0]] ?? 5
      } else if (editEmoSelections.length > 1) {
-        emo_text = editEmoSelections.map(e => `${e}:${(editEmoWeights[e] ?? 0.5).toFixed(2)}`).join('+')
+        emo_text = editEmoSelections.map(e => {
          const level = editEmoWeights[e] ?? 5
          const weight = parseFloat((level / 10 * (EMO_LEVEL_MAX[e] || 0.35)).toFixed(4))
          return `${e}:${weight}`
        }).join('+')
        emo_alpha = 1.0
      }
      await onUpdateSegment(segId, { text: editText, emo_text, emo_alpha })
@@ -1778,7 +1790,7 @@ function ChaptersPanel({
                                    const [name, w] = tok.split(':')
                                    return <span key={tok}>{i > 0 ? ' ' : ''}{name.trim()}{w ? `:${parseFloat(w).toFixed(2)}` : ''}</span>
                                  })}
-                                  {seg.emo_alpha != null && seg.emo_alpha !== 1 && `  :${seg.emo_alpha.toFixed(2)}`}
+                                  {seg.emo_alpha != null && seg.emo_alpha !== 1 && ` Lv.${seg.emo_alpha > 1 ? Math.round(seg.emo_alpha) : seg.emo_alpha}`}
                                </>
                              )}
                            </span>
@@ -1834,7 +1846,7 @@ function ChaptersPanel({
                                              setEditEmoSelections(prev => prev.filter(e => e !== emo))
                                            } else {
                                              setEditEmoSelections(prev => [...prev, emo])
-                                              setEditEmoWeights(prev => ({ ...prev, [emo]: prev[emo] ?? 0.5 }))
+                                              setEditEmoWeights(prev => ({ ...prev, [emo]: prev[emo] ?? 5 }))
                                            }
                                          }}
                                        >
@@ -1848,14 +1860,14 @@ function ChaptersPanel({
                                      <span className="text-xs text-muted-foreground w-8 shrink-0">{emo}:</span>
                                      <input
                                        type="range"
-                                        min={0.05}
+                                        min={1}
-                                        max={0.9}
+                                        max={10}
-                                        step={0.05}
+                                        step={1}
-                                        value={editEmoWeights[emo] ?? 0.5}
+                                        value={editEmoWeights[emo] ?? 5}
                                        onChange={e => setEditEmoWeights(prev => ({ ...prev, [emo]: Number(e.target.value) }))}
                                        className="flex-1 h-1.5 accent-primary"
                                      />
-                                      <span className="text-xs text-muted-foreground w-8 text-right">{(editEmoWeights[emo] ?? 0.5).toFixed(2)}</span>
+                                      <span className="text-xs text-muted-foreground w-6 text-right">{editEmoWeights[emo] ?? 5}</span>
                                    </div>
                                  ))}
                                </div>