feat: update emotion handling in audiobook segments and UI for multi-emotion selection

This commit is contained in:
2026-03-13 15:14:49 +08:00
parent 161e7fa76d
commit bf1532200a
4 changed files with 131 additions and 60 deletions

View File

@@ -16,7 +16,31 @@ from db.models import AudiobookProject, AudiobookCharacter, User
logger = logging.getLogger(__name__)
_LINE_RE = re.compile(r'^【(.+?)】(.*)$')
_EMO_RE = re.compile(r'([^:]+):([0-9.]+)\s*$')
_EMO_RE = re.compile(r'([^]+)\s*$')
def _parse_emo(raw: str) -> tuple[Optional[str], Optional[float]]:
tokens = [t.strip() for t in raw.split('+') if t.strip()]
if not tokens:
return None, None
weighted = [(':' in t) for t in tokens]
if all(weighted) and len(tokens) > 1:
return raw, 1.0
elif len(tokens) == 1 and weighted[0]:
name, _, a = tokens[0].partition(':')
try:
return name.strip(), float(a)
except ValueError:
return None, None
elif weighted[-1]:
last_name, _, a = tokens[-1].rpartition(':')
names = tokens[:-1] + [last_name]
try:
return '+'.join(n.strip() for n in names), float(a)
except ValueError:
return None, None
else:
return None, None
# Cancellation events for batch operations, keyed by project_id.
# NOTE(review): presumably an entry's event is set to ask the running batch
# for that project to stop; confirm against the code that creates and
# checks these entries, which is not visible here.
_cancel_events: dict[int, asyncio.Event] = {}
@@ -196,12 +220,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
emo_alpha = None
emo_m = _EMO_RE.search(content)
if emo_m:
emo_text = emo_m.group(1)
try:
emo_alpha = float(emo_m.group(2))
except ValueError:
emo_alpha = None
content = content[:emo_m.start()].strip()
et, ea = _parse_emo(emo_m.group(1))
if et is not None:
emo_text, emo_alpha = et, ea
content = content[:emo_m.start()].strip()
if content.startswith('"') and content.endswith('"'):
content = content[1:-1].strip()
@@ -211,12 +233,10 @@ def parse_ai_script(script_text: str, char_map: dict) -> list[dict]:
if emo_text is None:
emo_m = _EMO_RE.search(content)
if emo_m:
emo_text = emo_m.group(1)
try:
emo_alpha = float(emo_m.group(2))
except ValueError:
emo_alpha = None
content = content[:emo_m.start()].strip()
et, ea = _parse_emo(emo_m.group(1))
if et is not None:
emo_text, emo_alpha = et, ea
content = content[:emo_m.start()].strip()
character = speaker

View File

@@ -378,10 +378,11 @@ class LLMService:
" 【角色名】\"对话内容\"(情感词:强度)\n\n"
"情感标注规则:\n"
"- 情感词可选:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶\n"
"- 可用 + 拼接多个情感词表达复杂情绪,如(开心+悲伤:0.4)、(愤怒+恐惧:0.5\n"
"- 多情感时强度为混合情感的整体强度,每种情感对合成结果均有贡献\n"
f"- 各情感强度上限(严格不超过):{limits_str}\n"
"- 情感不明显时可省略(情感词:强度)整个括号\n"
"- 单一情感:(情感词:强度),如(开心:0.5)、(悲伤:0.3\n"
"- 混合情感情感1:比重+情感2:比重),如(开心:0.6+悲伤:0.2)、(愤怒:0.3+恐惧:0.4\n"
"- 混合情感时每个情感的比重独立设定,反映各自对情绪的贡献\n"
f"- 情感比重上限(严格不超过):{limits_str}\n"
"- 情感不明显时可省略整个括号\n"
+ narrator_rule
+ emo_guidance_line
+ "\n其他规则:\n"
@@ -453,16 +454,18 @@ class LLMService:
"你是一个专业的有声书制作助手。请将给定的章节文本解析为对话片段列表。"
f"已知角色列表(必须从中选择):{names_str}"
"所有非对话的叙述文字归属于旁白角色。\n"
"同时根据语境为每个片段判断是否有明显情绪,有则设置情绪类型(emo_text和强度emo_alpha,无则留空。\n"
"可选情绪:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
"- emo_text 可用 + 拼接多个情感词(如 \"开心+悲伤\"),表达复杂混合情绪\n"
"情绪不明显或旁白时,emo_text设为\"\"emo_alpha设为0。\n"
"各情绪强度上限(严格不超过):开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
"同时根据语境为每个片段判断是否有明显情绪,有则emo_text 中标注,无则留空。\n"
"可选情绪:开心、愤怒、悲伤、恐惧、厌恶、低沉、惊讶。\n"
"emo_text 格式规则:\n"
" 单一情感:直接填情感词,用 emo_alpha 设置强度,如 emo_text=\"开心\", emo_alpha=0.3\n"
" 混合情感:用 情感词:比重 格式拼接emo_alpha 设为 1.0,如 emo_text=\"开心:0.6+悲伤:0.2\", emo_alpha=1.0\n"
"情绪不明显或旁白时emo_text=\"\"emo_alpha=0。\n"
"各情感比重上限(严格不超过):开心=0.35、愤怒=0.15、悲伤=0.1、恐惧=0.1、厌恶=0.35、低沉=0.35、惊讶=0.1。\n"
"同一角色的连续台词,情绪应尽量保持一致或仅有微弱变化,避免相邻片段间情绪跳跃。\n"
"只输出JSON数组不要有其他文字格式如下\n"
'[{"character": "旁白", "text": "叙述文字", "emo_text": "", "emo_alpha": 0}, '
'{"character": "角色名", "text": "对话内容", "emo_text": "开心", "emo_alpha": 0.3}, '
'{"character": "角色名", "text": "泪的笑", "emo_text": "开心+悲伤", "emo_alpha": 0.4}]'
'{"character": "角色名", "text": "泪的笑", "emo_text": "开心:0.5+悲伤:0.2", "emo_alpha": 1.0}]'
)
user_message = f"请解析以下章节文本:\n\n{chapter_text}"
result = await self.stream_chat_json(system_prompt, user_message, on_token, max_tokens=16384, usage_callback=usage_callback)

View File

@@ -445,19 +445,35 @@ class IndexTTS2Backend:
@staticmethod
def _emo_text_to_vector(emo_text: str) -> Optional[list]:
    """Turn an emotion description into an 8-slot intensity vector.

    ``emo_text`` is split on ``+``.  Each token is either a bare emotion
    name or ``name:weight``; a weight that does not parse as a float is
    treated as absent.  A token selects every slot whose keyword list in
    ``_EMO_KEYWORDS`` contains a word occurring in the lower-cased name.

    Returns ``None`` when no token matches any keyword.  Otherwise returns
    a list of 8 floats, all ``0.0`` except the matched slots:

    * if any matched token has an explicit weight, each matched slot gets
      its own weight, and matched tokens without one get ``0.5``;
    * otherwise a single match gets ``0.8`` and several matches get
      ``0.5`` each, except that slot 1 is always set to ``0.2``.
    """
    tokens = [t.strip() for t in emo_text.split('+') if t.strip()]

    # (slot index, explicit weight or None) for every keyword hit.
    matched = []
    for tok in tokens:
        weight: Optional[float] = None
        name_part = tok
        if ':' in tok:
            name_part, w_str = tok.rsplit(':', 1)
            try:
                weight = float(w_str)
            except ValueError:
                weight = None
        name_lower = name_part.lower().strip()
        for idx, words in enumerate(IndexTTS2Backend._EMO_KEYWORDS):
            # At most one hit per slot per token, however many of the
            # slot's keywords occur in the name.
            if any(word in name_lower for word in words):
                matched.append((idx, weight))

    if not matched:
        return None

    vec = [0.0] * 8
    if any(w is not None for _, w in matched):
        for idx, w in matched:
            vec[idx] = w if w is not None else 0.5
    else:
        score = 0.8 if len(matched) == 1 else 0.5
        for idx, _ in matched:
            # NOTE(review): slot 1 is held at 0.2 regardless of the number
            # of matches; the reason is not visible here.
            vec[idx] = 0.2 if idx == 1 else score
    return vec
async def generate(