feat: Implement gender-specific TTS instructions, refactor async database session handling for character creation and preview generation, and add Aliyun voice design creation.
This commit is contained in:
@@ -36,6 +36,26 @@ def _get_llm_service(user: User) -> LLMService:
|
|||||||
return LLMService(base_url=user.llm_base_url, api_key=api_key, model=user.llm_model)
|
return LLMService(base_url=user.llm_base_url, api_key=api_key, model=user.llm_model)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_gendered_instruct(gender: Optional[str], base_instruct: str) -> str:
|
||||||
|
"""Ensure the instruction sent to the TTS model has explicit gender cues if known."""
|
||||||
|
if not gender or gender == "未知":
|
||||||
|
return base_instruct
|
||||||
|
|
||||||
|
# We want to force a clear gender bias at the start of the prompt
|
||||||
|
prefix = ""
|
||||||
|
if gender == "男":
|
||||||
|
prefix = "男性声音,"
|
||||||
|
elif gender == "女":
|
||||||
|
prefix = "女性声音,"
|
||||||
|
|
||||||
|
if prefix and prefix not in base_instruct:
|
||||||
|
# Prepend prefix, but try to be smart if the first line starts with "音色信息:"
|
||||||
|
if base_instruct.startswith("音色信息:"):
|
||||||
|
return base_instruct.replace("音色信息:", f"音色信息:{prefix}", 1)
|
||||||
|
return f"{prefix}{base_instruct}"
|
||||||
|
return base_instruct
|
||||||
|
|
||||||
|
|
||||||
def _extract_epub_chapters(file_path: str) -> list[str]:
|
def _extract_epub_chapters(file_path: str) -> list[str]:
|
||||||
try:
|
try:
|
||||||
import ebooklib
|
import ebooklib
|
||||||
@@ -166,6 +186,17 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
|
|||||||
|
|
||||||
samples = _sample_full_text(text)
|
samples = _sample_full_text(text)
|
||||||
n = len(samples)
|
n = len(samples)
|
||||||
|
|
||||||
|
# Ensure previews directory is clean for new analysis
|
||||||
|
previews_dir = Path(settings.OUTPUT_DIR) / "audiobook" / str(project_id) / "previews"
|
||||||
|
if previews_dir.exists():
|
||||||
|
import shutil
|
||||||
|
try:
|
||||||
|
shutil.rmtree(previews_dir)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to clear previews directory: {e}")
|
||||||
|
previews_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
mode_label = "极速并发" if turbo else "顺序"
|
mode_label = "极速并发" if turbo else "顺序"
|
||||||
ps.append_line(key, f"\n[LLM] 模型:{user.llm_model},共 {n} 个采样段({mode_label}模式),正在分析角色...\n")
|
ps.append_line(key, f"\n[LLM] 模型:{user.llm_model},共 {n} 个采样段({mode_label}模式),正在分析角色...\n")
|
||||||
ps.append_line(key, "")
|
ps.append_line(key, "")
|
||||||
@@ -173,9 +204,12 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
|
|||||||
def on_token(token: str) -> None:
|
def on_token(token: str) -> None:
|
||||||
ps.append_token(key, token)
|
ps.append_token(key, token)
|
||||||
|
|
||||||
|
completed_count = 0
|
||||||
def on_sample(i: int, total: int) -> None:
|
def on_sample(i: int, total: int) -> None:
|
||||||
if i < total - 1:
|
nonlocal completed_count
|
||||||
ps.append_line(key, f"\n[LLM] 采样段 {i + 1}/{total} 完成,继续分析...\n")
|
completed_count += 1
|
||||||
|
if completed_count < total:
|
||||||
|
ps.append_line(key, f"\n[LLM] 采样段 {completed_count}/{total} 完成,继续分析...\n")
|
||||||
else:
|
else:
|
||||||
ps.append_line(key, f"\n[LLM] 全部 {total} 个采样段完成,正在合并角色列表...\n")
|
ps.append_line(key, f"\n[LLM] 全部 {total} 个采样段完成,正在合并角色列表...\n")
|
||||||
ps.append_line(key, "")
|
ps.append_line(key, "")
|
||||||
@@ -201,53 +235,33 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
|
|||||||
crud.delete_audiobook_characters(db, project_id)
|
crud.delete_audiobook_characters(db, project_id)
|
||||||
|
|
||||||
backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun"
|
backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun"
|
||||||
|
|
||||||
async def _create_char_with_voice(char_data):
|
for char_data in characters_data:
|
||||||
name = char_data.get("name", "narrator")
|
name = char_data.get("name", "narrator")
|
||||||
instruct = char_data.get("instruct", "")
|
instruct = char_data.get("instruct", "")
|
||||||
description = char_data.get("description", "")
|
description = char_data.get("description", "")
|
||||||
gender = char_data.get("gender") or ("未知" if name == "narrator" else None)
|
gender = char_data.get("gender") or ("未知" if name == "narrator" else None)
|
||||||
|
|
||||||
# Requires isolated DB queries since we're in an async concurrent block
|
|
||||||
try:
|
try:
|
||||||
# We need an async wrapper or a local db session for concurrent sync DB pushes
|
voice_design = crud.create_voice_design(
|
||||||
# Because core crud uses synchronous SQLalchemy, executing them in threadpool via asyncio.to_thread
|
db=db,
|
||||||
import asyncio
|
user_id=user.id,
|
||||||
|
name=f"[有声书] {project.title} - {name}",
|
||||||
def db_ops():
|
instruct=instruct,
|
||||||
from core.database import SessionLocal
|
backend_type=backend_type,
|
||||||
local_db = SessionLocal()
|
preview_text=description[:100] if description else None,
|
||||||
try:
|
)
|
||||||
voice_design = crud.create_voice_design(
|
crud.create_audiobook_character(
|
||||||
db=local_db,
|
db=db,
|
||||||
user_id=user.id,
|
project_id=project_id,
|
||||||
name=f"[有声书] {project.title} - {name}",
|
name=name,
|
||||||
instruct=instruct,
|
gender=gender,
|
||||||
backend_type=backend_type,
|
description=description,
|
||||||
preview_text=description[:100] if description else None,
|
instruct=instruct,
|
||||||
)
|
voice_design_id=voice_design.id,
|
||||||
|
)
|
||||||
crud.create_audiobook_character(
|
|
||||||
db=local_db,
|
|
||||||
project_id=project_id,
|
|
||||||
name=name,
|
|
||||||
gender=gender,
|
|
||||||
description=description,
|
|
||||||
instruct=instruct,
|
|
||||||
voice_design_id=voice_design.id,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
local_db.close()
|
|
||||||
|
|
||||||
await asyncio.to_thread(db_ops)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to create char/voice for {name}: {e}")
|
logger.error(f"Failed to create char/voice for {name}: {e}")
|
||||||
|
|
||||||
import asyncio
|
|
||||||
batch_tasks = [_create_char_with_voice(cd) for cd in characters_data]
|
|
||||||
if batch_tasks:
|
|
||||||
await asyncio.gather(*batch_tasks)
|
|
||||||
|
|
||||||
crud.update_audiobook_project_status(db, project_id, "characters_ready")
|
crud.update_audiobook_project_status(db, project_id, "characters_ready")
|
||||||
ps.mark_done(key)
|
ps.mark_done(key)
|
||||||
logger.info(f"Project {project_id} character extraction complete: {len(characters_data)} characters")
|
logger.info(f"Project {project_id} character extraction complete: {len(characters_data)} characters")
|
||||||
@@ -259,25 +273,34 @@ async def analyze_project(project_id: int, user: User, db: Session, turbo: bool
|
|||||||
user_id = user.id
|
user_id = user.id
|
||||||
|
|
||||||
async def _generate_all_previews():
|
async def _generate_all_previews():
|
||||||
async_db = SessionLocal()
|
# Get character IDs first using a temporary session
|
||||||
|
temp_db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
db_user = crud.get_user_by_id(async_db, user_id)
|
characters = crud.list_audiobook_characters(temp_db, project_id)
|
||||||
characters = crud.list_audiobook_characters(async_db, project_id)
|
char_ids = [c.id for c in characters]
|
||||||
|
|
||||||
# Use a semaphore to limit concurrent TTS requests
|
|
||||||
sem = asyncio.Semaphore(3)
|
|
||||||
async def _gen(char_id: int):
|
|
||||||
async with sem:
|
|
||||||
try:
|
|
||||||
await generate_character_preview(project_id, char_id, db_user, async_db)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Background preview generation failed for char {char_id}: {e}")
|
|
||||||
|
|
||||||
tasks = [_gen(c.id) for c in characters]
|
|
||||||
if tasks:
|
|
||||||
await asyncio.gather(*tasks)
|
|
||||||
finally:
|
finally:
|
||||||
async_db.close()
|
temp_db.close()
|
||||||
|
|
||||||
|
if not char_ids:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Use a semaphore to limit concurrent TTS requests
|
||||||
|
sem = asyncio.Semaphore(3)
|
||||||
|
|
||||||
|
async def _gen(char_id: int):
|
||||||
|
async with sem:
|
||||||
|
# Each concurrent task MUST have its own dedicated session
|
||||||
|
local_db = SessionLocal()
|
||||||
|
try:
|
||||||
|
db_user = crud.get_user_by_id(local_db, user_id)
|
||||||
|
await generate_character_preview(project_id, char_id, db_user, local_db)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Background preview generation failed for char {char_id}: {e}")
|
||||||
|
finally:
|
||||||
|
local_db.close()
|
||||||
|
|
||||||
|
tasks = [_gen(cid) for cid in char_ids]
|
||||||
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
asyncio.create_task(_generate_all_previews())
|
asyncio.create_task(_generate_all_previews())
|
||||||
|
|
||||||
@@ -538,7 +561,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
|
|||||||
audio_bytes, _ = await backend.generate_voice_design({
|
audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": seg.text,
|
"text": seg.text,
|
||||||
"language": "zh",
|
"language": "zh",
|
||||||
"instruct": design.instruct,
|
"instruct": _get_gendered_instruct(char.gender, design.instruct),
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
if design.voice_cache_id:
|
if design.voice_cache_id:
|
||||||
@@ -563,7 +586,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
|
|||||||
audio_bytes, _ = await backend.generate_voice_design({
|
audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": seg.text,
|
"text": seg.text,
|
||||||
"language": "Auto",
|
"language": "Auto",
|
||||||
"instruct": design.instruct,
|
"instruct": _get_gendered_instruct(char.gender, design.instruct),
|
||||||
"max_new_tokens": 2048,
|
"max_new_tokens": 2048,
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"top_k": 10,
|
"top_k": 10,
|
||||||
@@ -574,7 +597,7 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind
|
|||||||
audio_bytes, _ = await backend.generate_voice_design({
|
audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": seg.text,
|
"text": seg.text,
|
||||||
"language": "Auto",
|
"language": "Auto",
|
||||||
"instruct": design.instruct,
|
"instruct": _get_gendered_instruct(char.gender, design.instruct),
|
||||||
"max_new_tokens": 2048,
|
"max_new_tokens": 2048,
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"top_k": 10,
|
"top_k": 10,
|
||||||
@@ -789,7 +812,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
|
|||||||
from core.cache_manager import VoiceCacheManager
|
from core.cache_manager import VoiceCacheManager
|
||||||
from utils.audio import process_ref_audio
|
from utils.audio import process_ref_audio
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
ref_text = "你好,这是参考音频。"
|
ref_text = "你好,这是参考音频。"
|
||||||
ref_audio_bytes, _ = await backend.generate_voice_design({
|
ref_audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": ref_text,
|
"text": ref_text,
|
||||||
@@ -823,6 +846,20 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
|
|||||||
db.commit()
|
db.commit()
|
||||||
logger.info(f"Bootstrapped local voice cache for preview: design_id={design.id}, cache_id={cache_id}")
|
logger.info(f"Bootstrapped local voice cache for preview: design_id={design.id}, cache_id={cache_id}")
|
||||||
|
|
||||||
|
if backend_type == "aliyun" and not design.aliyun_voice_id:
|
||||||
|
from core.tts_service import AliyunTTSBackend
|
||||||
|
if isinstance(backend, AliyunTTSBackend):
|
||||||
|
try:
|
||||||
|
voice_id = await backend._create_voice_design(
|
||||||
|
instruct=_get_gendered_instruct(char.gender, design.instruct),
|
||||||
|
preview_text=preview_text,
|
||||||
|
)
|
||||||
|
design.aliyun_voice_id = voice_id
|
||||||
|
db.commit()
|
||||||
|
logger.info(f"Bootstrapped aliyun voice_id for preview: design_id={design.id}, voice_id={voice_id}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to bootstrap aliyun voice_id for preview, falling back to instruct: {e}")
|
||||||
|
|
||||||
if backend_type == "aliyun":
|
if backend_type == "aliyun":
|
||||||
if design.aliyun_voice_id:
|
if design.aliyun_voice_id:
|
||||||
audio_bytes, _ = await backend.generate_voice_design(
|
audio_bytes, _ = await backend.generate_voice_design(
|
||||||
@@ -833,7 +870,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
|
|||||||
audio_bytes, _ = await backend.generate_voice_design({
|
audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": preview_text,
|
"text": preview_text,
|
||||||
"language": "zh",
|
"language": "zh",
|
||||||
"instruct": design.instruct,
|
"instruct": _get_gendered_instruct(char.gender, design.instruct),
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
if design.voice_cache_id:
|
if design.voice_cache_id:
|
||||||
@@ -858,7 +895,7 @@ async def generate_character_preview(project_id: int, char_id: int, user: User,
|
|||||||
audio_bytes, _ = await backend.generate_voice_design({
|
audio_bytes, _ = await backend.generate_voice_design({
|
||||||
"text": preview_text,
|
"text": preview_text,
|
||||||
"language": "Auto",
|
"language": "Auto",
|
||||||
"instruct": design.instruct,
|
"instruct": _get_gendered_instruct(char.gender, design.instruct),
|
||||||
"max_new_tokens": 512,
|
"max_new_tokens": 512,
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"top_k": 10,
|
"top_k": 10,
|
||||||
|
|||||||
@@ -122,12 +122,13 @@ class LLMService:
|
|||||||
"gender字段必须明确标注性别,只能取以下三个值之一:\"男\"、\"女\"、\"未知\"。\n"
|
"gender字段必须明确标注性别,只能取以下三个值之一:\"男\"、\"女\"、\"未知\"。\n"
|
||||||
"narrator的gender固定为\"未知\"。\n"
|
"narrator的gender固定为\"未知\"。\n"
|
||||||
"对每个角色,instruct字段必须是详细的声音导演说明,需覆盖以下六个维度,每个维度单独一句,用换行分隔:\n"
|
"对每个角色,instruct字段必须是详细的声音导演说明,需覆盖以下六个维度,每个维度单独一句,用换行分隔:\n"
|
||||||
"1. 音色信息:嗓音质感、音域、音量、气息特征(如:青年男性中低音,音色干净略带沙哑,音量偏小但稳定,情绪激动时呼吸明显)\n"
|
"1. 音色信息:嗓音质感、音域、音量、气息特征(例如,如果是女性角色,此处必须以'女性声音'开头,如:'女性声音,清脆悦耳的高音,嗓音纤细干净,带有一点点少女感';男性角色则以'男性声音'开头)\n"
|
||||||
"2. 身份背景:角色身份、职业、出身、所处时代背景对声音的影响\n"
|
"2. 身份背景:角色身份、职业、出身、所处时代背景对声音的影响\n"
|
||||||
"3. 年龄设定:具体年龄段及其在声音上的体现\n"
|
"3. 年龄设定:具体年龄段及其在声音上的体现\n"
|
||||||
"4. 外貌特征:体型、面容、精神状态等可影响声音感知的特征\n"
|
"4. 外貌特征:体型、面容、精神状态等可影响声音感知的特征\n"
|
||||||
"5. 性格特质:核心性格、情绪模式、表达习惯\n"
|
"5. 性格特质:核心性格、情绪模式、表达习惯\n"
|
||||||
"6. 叙事风格:语速节奏、停顿习惯、语气色彩、整体叙述感\n\n"
|
"6. 叙事风格:语速节奏、停顿习惯、语气色彩、整体叙述感\n\n"
|
||||||
|
"注意:instruct 的第一行(音色信息)必须与 gender 字段保持一致。如果 gender 为女,第一行绝对不能出现'男性'字样。\n\n"
|
||||||
"只输出JSON,格式如下,不要有其他文字:\n"
|
"只输出JSON,格式如下,不要有其他文字:\n"
|
||||||
'{"characters": [{"name": "narrator", "gender": "未知", "description": "第三人称叙述者", "instruct": "音色信息:...\\n身份背景:...\\n年龄设定:...\\n外貌特征:...\\n性格特质:...\\n叙事风格:..."}, ...]}'
|
'{"characters": [{"name": "narrator", "gender": "未知", "description": "第三人称叙述者", "instruct": "音色信息:...\\n身份背景:...\\n年龄设定:...\\n外貌特征:...\\n性格特质:...\\n叙事风格:..."}, ...]}'
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user