From bb6ad9b0a3959e3f45b3688557961656cc691814 Mon Sep 17 00:00:00 2001 From: bdim404 Date: Thu, 12 Mar 2026 15:48:35 +0800 Subject: [PATCH] feat: Implement segment update and regeneration features in Audiobook API and frontend --- qwen3-tts-backend/api/audiobook.py | 84 +++++++ qwen3-tts-backend/core/audiobook_service.py | 112 ++++++++++ qwen3-tts-backend/core/llm_service.py | 13 +- qwen3-tts-backend/db/crud.py | 20 ++ qwen3-tts-backend/schemas/audiobook.py | 6 + qwen3-tts-frontend/src/components/Navbar.tsx | 2 +- qwen3-tts-frontend/src/lib/api/audiobook.ts | 19 ++ .../src/locales/en-US/audiobook.json | 11 +- .../src/locales/ja-JP/audiobook.json | 11 +- .../src/locales/ko-KR/audiobook.json | 11 +- .../src/locales/zh-CN/audiobook.json | 11 +- .../src/locales/zh-TW/audiobook.json | 11 +- qwen3-tts-frontend/src/pages/Audiobook.tsx | 205 ++++++++++++++++-- 13 files changed, 485 insertions(+), 31 deletions(-) diff --git a/qwen3-tts-backend/api/audiobook.py b/qwen3-tts-backend/api/audiobook.py index aeb7d67..6dd13d6 100644 --- a/qwen3-tts-backend/api/audiobook.py +++ b/qwen3-tts-backend/api/audiobook.py @@ -20,6 +20,7 @@ from schemas.audiobook import ( AudiobookChapterResponse, AudiobookCharacterEdit, AudiobookSegmentResponse, + AudiobookSegmentUpdate, AudiobookGenerateRequest, AudiobookAnalyzeRequest, ) @@ -543,6 +544,89 @@ async def get_segments( return result +@router.put("/projects/{project_id}/segments/{segment_id}", response_model=AudiobookSegmentResponse) +async def update_segment( + project_id: int, + segment_id: int, + data: AudiobookSegmentUpdate, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db), +): + project = crud.get_audiobook_project(db, project_id, current_user.id) + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + seg = db.query(AudiobookSegment).filter( + AudiobookSegment.id == segment_id, + AudiobookSegment.project_id == project_id, + ).first() + if not seg: + raise HTTPException(status_code=404, detail="Segment not found") + + seg = crud.update_audiobook_segment(db, segment_id, data.text, data.emo_text, data.emo_alpha) + char_name = seg.character.name if seg.character else None + return AudiobookSegmentResponse( + id=seg.id, + project_id=seg.project_id, + chapter_index=seg.chapter_index, + segment_index=seg.segment_index, + character_id=seg.character_id, + character_name=char_name, + text=seg.text, + emo_text=seg.emo_text, + emo_alpha=seg.emo_alpha, + audio_path=seg.audio_path, + status=seg.status, + ) + + +@router.post("/projects/{project_id}/segments/{segment_id}/regenerate", response_model=AudiobookSegmentResponse) +async def regenerate_segment( + project_id: int, + segment_id: int, + current_user: User = Depends(get_current_user), + db: Session = Depends(get_db), +): + project = crud.get_audiobook_project(db, project_id, current_user.id) + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + seg = db.query(AudiobookSegment).filter( + AudiobookSegment.id == segment_id, + AudiobookSegment.project_id == project_id, + ).first() + if not seg: + raise HTTPException(status_code=404, detail="Segment not found") + + from core.audiobook_service import generate_single_segment + from core.database import SessionLocal + + async def run(): + async_db = SessionLocal() + try: + db_user = crud.get_user_by_id(async_db, current_user.id) + await generate_single_segment(segment_id, db_user, async_db) + finally: + async_db.close() + + asyncio.create_task(run()) + + char_name = seg.character.name if seg.character else None + return AudiobookSegmentResponse( + id=seg.id, + project_id=seg.project_id, + chapter_index=seg.chapter_index, + segment_index=seg.segment_index, + character_id=seg.character_id, + character_name=char_name, + text=seg.text, + emo_text=seg.emo_text, + emo_alpha=seg.emo_alpha, + audio_path=seg.audio_path, + status="generating", + ) + + @router.get("/projects/{project_id}/segments/{segment_id}/audio") async def get_segment_audio( project_id: int, diff --git a/qwen3-tts-backend/core/audiobook_service.py b/qwen3-tts-backend/core/audiobook_service.py index 0df2a72..2cc90e2 100644 --- a/qwen3-tts-backend/core/audiobook_service.py +++ b/qwen3-tts-backend/core/audiobook_service.py @@ -672,6 +672,118 @@ async def generate_project(project_id: int, user: User, db: Session, chapter_ind crud.update_audiobook_project_status(db, project_id, "error", error_message=str(e)) +async def generate_single_segment(segment_id: int, user: User, db: Session) -> None: + from db.models import AudiobookSegment as _Seg + seg = db.query(_Seg).filter(_Seg.id == segment_id).first() + if not seg: + return + + output_base = Path(settings.OUTPUT_DIR) / "audiobook" / str(seg.project_id) / "segments" + output_base.mkdir(parents=True, exist_ok=True) + + crud.update_audiobook_segment_status(db, segment_id, "generating") + try: + from core.tts_service import TTSServiceFactory + from core.security import decrypt_api_key + + backend_type = user.user_preferences.get("default_backend", "aliyun") if user.user_preferences else "aliyun" + user_api_key = None + if backend_type == "aliyun" and user.aliyun_api_key: + user_api_key = decrypt_api_key(user.aliyun_api_key) + + backend = await TTSServiceFactory.get_backend(backend_type, user_api_key) + + char = crud.get_audiobook_character(db, seg.character_id) + if not char or not char.voice_design_id: + crud.update_audiobook_segment_status(db, segment_id, "error") + return + + design = crud.get_voice_design(db, char.voice_design_id, user.id) + if not design: + crud.update_audiobook_segment_status(db, segment_id, "error") + return + + await _bootstrap_character_voices([seg], user, backend, backend_type, db) + db.refresh(design) + + audio_filename = f"ch{seg.chapter_index:03d}_seg{seg.segment_index:04d}.wav" + audio_path = output_base / audio_filename + + if backend_type == "aliyun": + if design.aliyun_voice_id: + audio_bytes, _ = await backend.generate_voice_design( + {"text": seg.text, "language": "zh"}, + saved_voice_id=design.aliyun_voice_id + ) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": seg.text, + "language": "zh", + "instruct": _get_gendered_instruct(char.gender, design.instruct), + }) + elif char.use_indextts2 and design.ref_audio_path and Path(design.ref_audio_path).exists(): + from core.tts_service import IndexTTS2Backend + indextts2 = IndexTTS2Backend() + audio_bytes = await indextts2.generate( + text=seg.text, + spk_audio_prompt=design.ref_audio_path, + output_path=str(audio_path), + emo_text=seg.emo_text or None, + emo_alpha=seg.emo_alpha if seg.emo_text else 0.5, + ) + else: + if design.voice_cache_id: + from core.cache_manager import VoiceCacheManager + cache_manager = await VoiceCacheManager.get_instance() + cache_result = await cache_manager.get_cache_by_id(design.voice_cache_id, db) + x_vector = cache_result['data'] if cache_result else None + if x_vector: + audio_bytes, _ = await backend.generate_voice_clone( + { + "text": seg.text, + "language": "Auto", + "max_new_tokens": 2048, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }, + x_vector=x_vector + ) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": seg.text, + "language": "Auto", + "instruct": _get_gendered_instruct(char.gender, design.instruct), + "max_new_tokens": 2048, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }) + else: + audio_bytes, _ = await backend.generate_voice_design({ + "text": seg.text, + "language": "Auto", + "instruct": _get_gendered_instruct(char.gender, design.instruct), + "max_new_tokens": 2048, + "temperature": 0.3, + "top_k": 10, + "top_p": 0.9, + "repetition_penalty": 1.05, + }) + + with open(audio_path, "wb") as f: + f.write(audio_bytes) + + crud.update_audiobook_segment_status(db, segment_id, "done", audio_path=str(audio_path)) + logger.info(f"Single segment {segment_id} generated: {audio_path}") + + except Exception as e: + logger.error(f"Single segment {segment_id} generation failed: {e}", exc_info=True) + crud.update_audiobook_segment_status(db, segment_id, "error") + + def merge_audio_files(audio_paths: list[str], output_path: str) -> None: from pydub import AudioSegment diff --git a/qwen3-tts-backend/core/llm_service.py b/qwen3-tts-backend/core/llm_service.py index 43f8a37..985043a 100644 --- a/qwen3-tts-backend/core/llm_service.py +++ b/qwen3-tts-backend/core/llm_service.py @@ -131,13 +131,12 @@ class LLMService: "5. 性格特质:核心性格、情绪模式、表达习惯\n" "6. 叙事风格:语速节奏、停顿习惯、语气色彩、整体叙述感\n\n" "注意:instruct 的第一行(音色信息)必须与 gender 字段保持一致。如果 gender 为女,第一行绝对不能出现'男性'字样。\n\n" - "【特别规定】narrator(旁白)的 instruct 必须固定描述为传统说书人风格,参考如下模板(根据书籍风格可微调措辞,但风格不变):\n" - "音色信息:浑厚醇厚的男性中低音,嗓音饱满有力,带有传统说书人的磁性与感染力\n" - "身份背景:中国传统说书艺人,精通评书、章回小说叙述艺术,深谙故事节奏与听众心理\n" - "年龄设定:中年男性,四五十岁,声音历经岁月沉淀,成熟稳重而不失活力\n" - "外貌特征:面容沉稳,气度从容,台风大气,给人以可信赖的叙述者印象\n" - "性格特质:沉稳睿智,叙事冷静客观,情到深处能引发共鸣,不动声色间娓娓道来\n" - "叙事风格:语速适中偏慢,抑扬顿挫,擅长铺垫悬念,停顿恰到好处,语气庄重而生动,富有画面感\n\n" + "【特别规定】narrator(旁白)的 instruct 必须根据小说类型选择对应的叙述者音色风格,规则如下:\n" + "▸ 古风/武侠/历史/玄幻/仙侠/奇幻 → 传统说书人风格:浑厚醇厚的男性中低音,嗓音饱满有力,带有说书人的磁性与感染力;中年男性,四五十岁;语速适中偏慢,抑扬顿挫,停顿恰到好处,语气庄重生动,富有画面感\n" + "▸ 现代言情/都市爱情/青春校园 → 年轻女性叙述者风格:女性声音,清亮柔和的中高音,嗓音清新干净,带有亲切温柔的娓娓道来感;二三十岁年轻女性;语速轻快自然,情感细腻,语气温柔而富有感染力\n" + "▸ 悬疑/推理/惊悚/恐怖 → 低沉神秘风格:男性声音,低沉压抑的男性低音,嗓音干练克制,带有一丝神秘与张力;中年男性;语速沉稳偏慢,停顿制造悬念,语气冷静克制,暗藏紧张感\n" + "▸ 科幻/末世/赛博朋克 → 理性宏观风格:男性声音,清晰有力的男性中音,嗓音冷静客观,带有纪录片解说员的宏大叙事感;语速稳定,条理清晰,语气客观宏观,富有科技感与史诗感\n" + "▸ 其他/无法判断 → 传统说书人风格(同古风类型)\n\n" "只输出JSON,格式如下,不要有其他文字:\n" '{"characters": [{"name": "narrator", "gender": "未知", "description": "第三人称叙述者", "instruct": "音色信息:...\\n身份背景:...\\n年龄设定:...\\n外貌特征:...\\n性格特质:...\\n叙事风格:..."}, ...]}' ) diff --git a/qwen3-tts-backend/db/crud.py b/qwen3-tts-backend/db/crud.py index 789f2e3..d533b68 100644 --- a/qwen3-tts-backend/db/crud.py +++ b/qwen3-tts-backend/db/crud.py @@ -640,6 +640,26 @@ def update_audiobook_segment_status( return seg +def update_audiobook_segment( + db: Session, + segment_id: int, + text: str, + emo_text: Optional[str], + emo_alpha: Optional[float], +) -> Optional[AudiobookSegment]: + seg = db.query(AudiobookSegment).filter(AudiobookSegment.id == segment_id).first() + if not seg: + return None + seg.text = text + seg.emo_text = emo_text or None + seg.emo_alpha = emo_alpha + seg.status = "pending" + seg.audio_path = None + db.commit() + db.refresh(seg) + return seg + + def delete_audiobook_segments(db: Session, project_id: int) -> None: db.query(AudiobookSegment).filter(AudiobookSegment.project_id == project_id).delete() db.commit() diff --git a/qwen3-tts-backend/schemas/audiobook.py b/qwen3-tts-backend/schemas/audiobook.py index c2ee3f3..4dda842 100644 --- a/qwen3-tts-backend/schemas/audiobook.py +++ b/qwen3-tts-backend/schemas/audiobook.py @@ -89,6 +89,12 @@ class AudiobookSegmentResponse(BaseModel): model_config = ConfigDict(from_attributes=True) +class AudiobookSegmentUpdate(BaseModel): + text: str + emo_text: Optional[str] = None + emo_alpha: Optional[float] = None + + class LLMConfigUpdate(BaseModel): base_url: str api_key: str diff --git a/qwen3-tts-frontend/src/components/Navbar.tsx b/qwen3-tts-frontend/src/components/Navbar.tsx index 6706b6a..541752b 100644 --- a/qwen3-tts-frontend/src/components/Navbar.tsx +++ b/qwen3-tts-frontend/src/components/Navbar.tsx @@ -36,7 +36,7 @@ export function Navbar({ onToggleSidebar }: NavbarProps) { )} {location.pathname !== '/' && ( - + diff --git a/qwen3-tts-frontend/src/lib/api/audiobook.ts b/qwen3-tts-frontend/src/lib/api/audiobook.ts index 99d4ec8..ba91cb0 100644 --- a/qwen3-tts-frontend/src/lib/api/audiobook.ts +++ b/qwen3-tts-frontend/src/lib/api/audiobook.ts @@ -138,6 +138,25 @@ export const audiobookApi = { return `/audiobook/projects/${id}/download${chapterParam}` }, + updateSegment: async ( + projectId: number, + segmentId: number, + data: { text: string; emo_text?: string | null; emo_alpha?: number | null } + ): Promise => { + const response = await apiClient.put( + `/audiobook/projects/${projectId}/segments/${segmentId}`, + data + ) + return response.data + }, + + regenerateSegment: async (projectId: number, segmentId: number): Promise => { + const response = await apiClient.post( + `/audiobook/projects/${projectId}/segments/${segmentId}/regenerate` + ) + return response.data + }, + getSegmentAudioUrl: (projectId: number, segmentId: number): string => { return `/audiobook/projects/${projectId}/segments/${segmentId}/audio` }, diff --git a/qwen3-tts-frontend/src/locales/en-US/audiobook.json b/qwen3-tts-frontend/src/locales/en-US/audiobook.json index c1a3622..d48f0b8 100644 --- a/qwen3-tts-frontend/src/locales/en-US/audiobook.json +++ b/qwen3-tts-frontend/src/locales/en-US/audiobook.json @@ -115,7 +115,16 @@ "segments": { "errorBadge": "Error", - "unknownCharacter": "?" + "unknownCharacter": "?", + "edit": "Edit", + "save": "Save", + "cancel": "Cancel", + "regenerate": "Regenerate", + "regenerating": "Generating...", + "savedSuccess": "Segment saved", + "emotion": "Emotion", + "noEmotion": "No emotion", + "intensity": "Intensity" }, "sequential": { diff --git a/qwen3-tts-frontend/src/locales/ja-JP/audiobook.json b/qwen3-tts-frontend/src/locales/ja-JP/audiobook.json index b06b51c..c6c9cdb 100644 --- a/qwen3-tts-frontend/src/locales/ja-JP/audiobook.json +++ b/qwen3-tts-frontend/src/locales/ja-JP/audiobook.json @@ -114,7 +114,16 @@ "segments": { "errorBadge": "エラー", - "unknownCharacter": "?" + "unknownCharacter": "?", + "edit": "編集", + "save": "保存", + "cancel": "キャンセル", + "regenerate": "再生成", + "regenerating": "生成中...", + "savedSuccess": "セグメントを保存しました", + "emotion": "感情", + "noEmotion": "感情なし", + "intensity": "強度" }, "sequential": { diff --git a/qwen3-tts-frontend/src/locales/ko-KR/audiobook.json b/qwen3-tts-frontend/src/locales/ko-KR/audiobook.json index 8045e24..4d6dd1f 100644 --- a/qwen3-tts-frontend/src/locales/ko-KR/audiobook.json +++ b/qwen3-tts-frontend/src/locales/ko-KR/audiobook.json @@ -114,7 +114,16 @@ "segments": { "errorBadge": "오류", - "unknownCharacter": "?" + "unknownCharacter": "?", + "edit": "편집", + "save": "저장", + "cancel": "취소", + "regenerate": "재생성", + "regenerating": "생성 중...", + "savedSuccess": "세그먼트가 저장되었습니다", + "emotion": "감정", + "noEmotion": "감정 없음", + "intensity": "강도" }, "sequential": { diff --git a/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json b/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json index c429218..a2d1bb9 100644 --- a/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json +++ b/qwen3-tts-frontend/src/locales/zh-CN/audiobook.json @@ -118,7 +118,16 @@ "segments": { "errorBadge": "出错", - "unknownCharacter": "?" + "unknownCharacter": "?", + "edit": "编辑", + "save": "保存", + "cancel": "取消", + "regenerate": "重新生成", + "regenerating": "生成中...", + "savedSuccess": "片段已保存", + "emotion": "情绪", + "noEmotion": "无情绪", + "intensity": "强度" }, "sequential": { diff --git a/qwen3-tts-frontend/src/locales/zh-TW/audiobook.json b/qwen3-tts-frontend/src/locales/zh-TW/audiobook.json index 4e276b8..a607a7d 100644 --- a/qwen3-tts-frontend/src/locales/zh-TW/audiobook.json +++ b/qwen3-tts-frontend/src/locales/zh-TW/audiobook.json @@ -114,7 +114,16 @@ "segments": { "errorBadge": "出錯", - "unknownCharacter": "?" + "unknownCharacter": "?", + "edit": "編輯", + "save": "儲存", + "cancel": "取消", + "regenerate": "重新生成", + "regenerating": "生成中...", + "savedSuccess": "片段已儲存", + "emotion": "情緒", + "noEmotion": "無情緒", + "intensity": "強度" }, "sequential": { diff --git a/qwen3-tts-frontend/src/pages/Audiobook.tsx b/qwen3-tts-frontend/src/pages/Audiobook.tsx index 91bfe14..da853fb 100644 --- a/qwen3-tts-frontend/src/pages/Audiobook.tsx +++ b/qwen3-tts-frontend/src/pages/Audiobook.tsx @@ -482,7 +482,7 @@ function CharactersPanel({ return (
-
+
{t('projectCard.characters.title', { count: charCount })} @@ -625,6 +625,11 @@ function CharactersPanel({ ) } +const EMOTION_OPTIONS = ['开心', '愤怒', '悲伤', '恐惧', '厌恶', '低沉', '惊讶', '中性'] +const EMOTION_ALPHA_DEFAULTS: Record = { + 开心: 0.6, 愤怒: 0.15, 悲伤: 0.4, 恐惧: 0.4, 厌恶: 0.6, 低沉: 0.6, 惊讶: 0.3, 中性: 0.5, +} + function ChaptersPanel({ project, detail, @@ -639,6 +644,8 @@ function ChaptersPanel({ onProcessAll, onDownload, onSequentialPlayingChange, + onUpdateSegment, + onRegenerateSegment, }: { project: AudiobookProject detail: AudiobookProjectDetail | null @@ -653,9 +660,50 @@ function ChaptersPanel({ onProcessAll: () => void onDownload: (chapterIndex?: number) => void onSequentialPlayingChange: (id: number | null) => void + onUpdateSegment: (segmentId: number, data: { text: string; emo_text?: string | null; emo_alpha?: number | null }) => Promise + onRegenerateSegment: (segmentId: number) => Promise }) { const { t } = useTranslation('audiobook') const [expandedChapters, setExpandedChapters] = useState>(new Set()) + const [editingSegId, setEditingSegId] = useState(null) + const [editText, setEditText] = useState('') + const [editEmoText, setEditEmoText] = useState('') + const [editEmoAlpha, setEditEmoAlpha] = useState(0.5) + const [savingSegId, setSavingSegId] = useState(null) + const [regeneratingSegs, setRegeneratingSegs] = useState>(new Set()) + + const startEdit = (seg: AudiobookSegment) => { + setEditingSegId(seg.id) + setEditText(seg.text) + setEditEmoText(seg.emo_text || '') + setEditEmoAlpha(seg.emo_alpha ?? 0.5) + } + + const cancelEdit = () => setEditingSegId(null) + + const saveEdit = async (segId: number) => { + setSavingSegId(segId) + try { + await onUpdateSegment(segId, { + text: editText, + emo_text: editEmoText || null, + emo_alpha: editEmoText ? editEmoAlpha : null, + }) + setEditingSegId(null) + } finally { + setSavingSegId(null) + } + } + + const handleRegenerate = async (segId: number) => { + setRegeneratingSegs(prev => new Set([...prev, segId])) + try { + await onRegenerateSegment(segId) + } finally { + setRegeneratingSegs(prev => { const n = new Set(prev); n.delete(segId); return n }) + } + } + const status = project.status const doneCount = segments.filter(s => s.status === 'done').length @@ -676,7 +724,7 @@ function ChaptersPanel({ return (
-
+
{t('projectCard.chapters.title', { count: detail?.chapters.length ?? 0 })} @@ -786,24 +834,119 @@ function ChaptersPanel({ )} {chExpanded && chSegs.length > 0 && (
- {chSegs.map(seg => ( -
-
- - {seg.character_name || t('projectCard.segments.unknownCharacter')} - - {seg.emo_text && ( - {seg.emo_text} + {chSegs.map(seg => { + const isEditing = editingSegId === seg.id + const isRegenerating = regeneratingSegs.has(seg.id) || seg.status === 'generating' + const isSaving = savingSegId === seg.id + return ( +
+
+ + {seg.character_name || t('projectCard.segments.unknownCharacter')} + + {!isEditing && seg.emo_text && ( + {seg.emo_text} + )} + {isRegenerating && } + {!isRegenerating && seg.status === 'error' && {t('projectCard.segments.errorBadge')}} +
+ {!isEditing && ( + <> + + + + )} + {isEditing && ( + <> + + + + )} +
+
+ {isEditing ? ( +
+