From bf1532200a5eab3ded00510ec885bfa4ca348c19 Mon Sep 17 00:00:00 2001 From: bdim404 Date: Fri, 13 Mar 2026 15:14:49 +0800 Subject: [PATCH] feat: update emotion handling in audiobook segments and UI for multi-emotion selection --- qwen3-tts-backend/core/audiobook_service.py | 46 ++++++++--- qwen3-tts-backend/core/llm_service.py | 23 +++--- qwen3-tts-backend/core/tts_service.py | 34 +++++--- qwen3-tts-frontend/src/pages/Audiobook.tsx | 88 ++++++++++++++------- 4 files changed, 131 insertions(+), 60 deletions(-) diff --git a/qwen3-tts-backend/core/audiobook_service.py b/qwen3-tts-backend/core/audiobook_service.py index abd0268..51d3f00 100644 --- a/qwen3-tts-backend/core/audiobook_service.py +++ b/qwen3-tts-backend/core/audiobook_service.py @@ -16,7 +16,31 @@ from db.models import AudiobookProject, AudiobookCharacter, User logger = logging.getLogger(__name__) _LINE_RE = re.compile(r'^【(.+?)】(.*)$') -_EMO_RE = re.compile(r'(([^:)]+):([0-9.]+))\s*$') +_EMO_RE = re.compile(r'(([^)]+))\s*$') + + +def _parse_emo(raw: str) -> tuple[Optional[str], Optional[float]]: + tokens = [t.strip() for t in raw.split('+') if t.strip()] + if not tokens: + return None, None + weighted = [(':' in t) for t in tokens] + if all(weighted) and len(tokens) > 1: + return raw, 1.0 + elif len(tokens) == 1 and weighted[0]: + name, _, a = tokens[0].partition(':') + try: + return name.strip(), float(a) + except ValueError: + return None, None + elif weighted[-1]: + last_name, _, a = tokens[-1].rpartition(':') + names = tokens[:-1] + [last_name] + try: + return '+'.join(n.strip() for n in names), float(a) + except ValueError: + return None, None + else: + return None, None # Cancellation events for batch operations, keyed by project_id _cancel_events: dict[int, asyncio.Event] = {} @@ -196,12 +220,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]: emo_alpha = None emo_m = _EMO_RE.search(content) if emo_m: - emo_text = emo_m.group(1) - try: - emo_alpha = float(emo_m.group(2)) - except ValueError: - emo_alpha = None - content = content[:emo_m.start()].strip() + et, ea = _parse_emo(emo_m.group(1)) + if et is not None: + emo_text, emo_alpha = et, ea + content = content[:emo_m.start()].strip() if content.startswith('"') and content.endswith('"'): content = content[1:-1].strip() @@ -211,12 +233,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]: if emo_text is None: emo_m = _EMO_RE.search(content) if emo_m: - emo_text = emo_m.group(1) - try: - emo_alpha = float(emo_m.group(2)) - except ValueError: - emo_alpha = None - content = content[:emo_m.start()].strip() + et, ea = _parse_emo(emo_m.group(1)) + if et is not None: + emo_text, emo_alpha = et, ea + content = content[:emo_m.start()].strip() character = speaker diff --git a/qwen3-tts-backend/core/llm_service.py b/qwen3-tts-backend/core/llm_service.py index fcf6ea5..0b7246d 100644 --- a/qwen3-tts-backend/core/llm_service.py +++ b/qwen3-tts-backend/core/llm_service.py @@ -378,10 +378,11 @@ class LLMService: " 【角色名】\"对话内容\"(情感词:强度)\n\n" "情感标注规则:\n" "- 情感词可选:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶\n" - "- 可用 + 拼接多个情感词表达复杂情绪,如(开心+悲伤:0.4)、(愤怒+恐惧:0.5)\n" - "- 多情感时强度为混合情感的整体强度,每种情感对合成结果均有贡献\n" - f"- 各情感强度上限(严格不超过):{limits_str}\n" - "- 情感不明显时可省略(情感词:强度)整个括号\n" + "- 单一情感:(情感词:强度),如(开心:0.5)、(悲伤:0.3)\n" + "- 混合情感:(情感1:比重+情感2:比重),如(开心:0.6+悲伤:0.2)、(愤怒:0.3+恐惧:0.4)\n" + "- 混合情感时每个情感的比重独立设定,反映各自对情绪的贡献\n" + f"- 各情感比重上限(严格不超过):{limits_str}\n" + "- 情感不明显时可省略整个括号\n" + narrator_rule + emo_guidance_line + "\n其他规则:\n" @@ -453,16 +454,18 @@ class LLMService: "你是一个专业的有声书制作助手。请将给定的章节文本解析为对话片段列表。" f"已知角色列表(必须从中选择):{names_str}。" "所有非对话的叙述文字归属于旁白角色。\n" - "同时根据语境为每个片段判断是否有明显情绪,有则设置情绪类型(emo_text)和强度(emo_alpha),无则留空。\n" - "可选情绪:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n" - "- emo_text 可用 + 拼接多个情感词(如 \"开心+悲伤\"),表达复杂混合情绪\n" - "情绪不明显或旁白时,emo_text设为\"\",emo_alpha设为0。\n" - "各情绪强度上限(严格不超过):开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n" + "同时根据语境为每个片段判断是否有明显情绪,有则在 emo_text 中标注,无则留空。\n" + "可选情绪词:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n" + "emo_text 格式规则:\n" + " 单一情感:直接填情感词,用 emo_alpha 设置强度,如 emo_text=\"开心\", emo_alpha=0.3\n" + " 混合情感:用 情感词:比重 格式拼接,emo_alpha 设为 1.0,如 emo_text=\"开心:0.6+悲伤:0.2\", emo_alpha=1.0\n" + "情绪不明显或旁白时,emo_text=\"\",emo_alpha=0。\n" + "各情感比重上限(严格不超过):开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n" "同一角色的连续台词,情绪应尽量保持一致或仅有微弱变化,避免相邻片段间情绪跳跃。\n" "只输出JSON数组,不要有其他文字,格式如下:\n" '[{"character": "旁白", "text": "叙述文字", "emo_text": "", "emo_alpha": 0}, ' '{"character": "角色名", "text": "对话内容", "emo_text": "开心", "emo_alpha": 0.3}, ' - '{"character": "角色名", "text": "带泪的笑", "emo_text": "开心+悲伤", "emo_alpha": 0.4}]' + '{"character": "角色名", "text": "含泪的笑", "emo_text": "开心:0.5+悲伤:0.2", "emo_alpha": 1.0}]' ) user_message = f"请解析以下章节文本:\n\n{chapter_text}" result = await self.stream_chat_json(system_prompt, user_message, on_token, max_tokens=16384, usage_callback=usage_callback) diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py index d4f96eb..cfb6b0c 100644 --- a/qwen3-tts-backend/core/tts_service.py +++ b/qwen3-tts-backend/core/tts_service.py @@ -445,19 +445,35 @@ class IndexTTS2Backend: @staticmethod def _emo_text_to_vector(emo_text: str) -> Optional[list]: - text = emo_text.lower() + tokens = [t.strip() for t in emo_text.split('+') if t.strip()] matched = [] - for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS): - for word in words: - if word in text: - matched.append(idx) - break + for tok in tokens: + if ':' in tok: + name_part, w_str = tok.rsplit(':', 1) + try: + weight: Optional[float] = float(w_str) + except ValueError: + weight = None + else: + name_part = tok + weight = None + name_lower = name_part.lower().strip() + for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS): + for word in words: + if word in name_lower: + matched.append((idx, weight)) + break if not matched: return None vec = [0.0] * 8 - score = 0.8 if len(matched) == 1 else 0.5 - for idx in matched: - vec[idx] = 0.2 if idx == 1 else score + has_explicit = any(w is not None for _, w in matched) + if has_explicit: + for idx, w in matched: + vec[idx] = w if w is not None else 0.5 + else: + score = 0.8 if len(matched) == 1 else 0.5 + for idx, _ in matched: + vec[idx] = 0.2 if idx == 1 else score return vec async def generate( diff --git a/qwen3-tts-frontend/src/pages/Audiobook.tsx b/qwen3-tts-frontend/src/pages/Audiobook.tsx index d9936b6..7bd7696 100644 --- a/qwen3-tts-frontend/src/pages/Audiobook.tsx +++ b/qwen3-tts-frontend/src/pages/Audiobook.tsx @@ -1428,8 +1428,8 @@ function ChaptersPanel({ const [expandedChapters, setExpandedChapters] = useState>(new Set()) const [editingSegId, setEditingSegId] = useState(null) const [editText, setEditText] = useState('') - const [editEmoText, setEditEmoText] = useState('') - const [editEmoAlpha, setEditEmoAlpha] = useState(0.5) + const [editEmoSelections, setEditEmoSelections] = useState([]) + const [editEmoWeights, setEditEmoWeights] = useState>({}) const [savingSegId, setSavingSegId] = useState(null) const [regeneratingSegs, setRegeneratingSegs] = useState>(new Set()) const [audioVersions, setAudioVersions] = useState>({}) @@ -1463,8 +1463,30 @@ function ChaptersPanel({ const startEdit = (seg: AudiobookSegment) => { setEditingSegId(seg.id) setEditText(seg.text) - setEditEmoText(seg.emo_text || '') - setEditEmoAlpha(seg.emo_alpha ?? 0.5) + const rawEmo = seg.emo_text || '' + const alpha = seg.emo_alpha ?? 0.5 + if (!rawEmo) { + setEditEmoSelections([]) + setEditEmoWeights({}) + return + } + const tokens = rawEmo.split('+').filter(Boolean) + const selections: string[] = [] + const weights: Record = {} + if (tokens.length === 1) { + const [name] = tokens[0].split(':') + selections.push(name.trim()) + weights[name.trim()] = alpha + } else { + for (const tok of tokens) { + const [name, w] = tok.split(':') + const emo = name.trim() + selections.push(emo) + weights[emo] = w ? parseFloat(w) : parseFloat((0.5 * alpha).toFixed(2)) + } + } + setEditEmoSelections(selections) + setEditEmoWeights(weights) } const cancelEdit = () => setEditingSegId(null) @@ -1472,11 +1494,16 @@ function ChaptersPanel({ const saveEdit = async (segId: number) => { setSavingSegId(segId) try { - await onUpdateSegment(segId, { - text: editText, - emo_text: editEmoText || null, - emo_alpha: editEmoText ? editEmoAlpha : null, - }) + let emo_text: string | null = null + let emo_alpha: number | null = null + if (editEmoSelections.length === 1) { + emo_text = editEmoSelections[0] + emo_alpha = editEmoWeights[editEmoSelections[0]] ?? 0.5 + } else if (editEmoSelections.length > 1) { + emo_text = editEmoSelections.map(e => `${e}:${(editEmoWeights[e] ?? 0.5).toFixed(2)}`).join('+') + emo_alpha = 1.0 + } + await onUpdateSegment(segId, { text: editText, emo_text, emo_alpha }) setEditingSegId(null) } finally { setSavingSegId(null) @@ -1673,11 +1700,16 @@ function ChaptersPanel({ {!isEditing && seg.emo_text && ( - {seg.emo_text.split('+').map(e => ( - {e.trim()} - ))} - {seg.emo_alpha != null && ( - {seg.emo_alpha.toFixed(2)} + {seg.emo_text.split('+').map(tok => { + const [name, w] = tok.split(':') + return ( + + {name.trim()}{w && :{parseFloat(w).toFixed(2)}} + + ) + })} + {seg.emo_alpha != null && seg.emo_alpha !== 1 && ( + ×{seg.emo_alpha.toFixed(2)} )} )} @@ -1722,19 +1754,19 @@ function ChaptersPanel({
{t('projectCard.segments.emotion')}: {EMOTION_OPTIONS.map(emo => { - const selectedEmos = editEmoText.split('+').filter(Boolean) - const isSelected = selectedEmos.includes(emo) + const isSelected = editEmoSelections.includes(emo) return (
- {editEmoText && ( -
- {t('projectCard.segments.intensity')}: + {editEmoSelections.map(emo => ( +
+ {emo}: setEditEmoAlpha(Number(e.target.value))} + value={editEmoWeights[emo] ?? 0.5} + onChange={e => setEditEmoWeights(prev => ({ ...prev, [emo]: Number(e.target.value) }))} className="flex-1 h-1.5 accent-primary" /> - {editEmoAlpha.toFixed(2)} + {(editEmoWeights[emo] ?? 0.5).toFixed(2)}
- )} + ))}
) : (