- Rename qwen3-tts-backend → canto-backend - Rename qwen3-tts-frontend → canto-frontend - Remove NovelWriter embedded repo Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
108 lines
3.5 KiB
Python
108 lines
3.5 KiB
Python
"""
|
|
IndexTTS2 standalone test - following official webui best practices.
|
|
|
|
Key differences from our current backend:
|
|
- is_fp16=False (official webui default, we were using is_fp16=True)
|
|
- Tests the no-emotion baseline and keyword-mapped emo_vector cases (audio-ref and emo_text modes are not exercised by this script)
|
|
"""
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
# All data paths are resolved relative to this script's own directory.
_HERE = os.path.dirname(__file__)

# NOTE(review): the backend directory may have been renamed (e.g. to
# "canto-backend") -- confirm that "qwen3-tts-backend" is still the right name.
MODEL_DIR = os.path.join(_HERE, "qwen3-tts-backend", "Qwen", "IndexTTS2")
CFG_PATH = os.path.join(MODEL_DIR, "config.yaml")
REF_AUDIO = os.path.join(_HERE, "test_indextts2_outputs", "ref_audio_id242_test54321.wav")
OUT_DIR = os.path.join(_HERE, "test_indextts2_v2")

os.makedirs(OUT_DIR, exist_ok=True)

# Checkout that provides the `indextts` package. It can be overridden with the
# INDEXTTS2_REPO environment variable so the script is not tied to a single
# machine; the default is the path that was previously hard-coded.
INDEXTTS2_REPO = os.environ.get("INDEXTTS2_REPO", "/home/bdim/Documents/github/indexTTS2")
sys.path.insert(0, INDEXTTS2_REPO)

print(f"Model dir: {MODEL_DIR}")
print(f"Config: {CFG_PATH}")
print(f"Ref audio: {REF_AUDIO}")
print(f"Output dir: {OUT_DIR}")
print()

print("Loading IndexTTS2 model (is_fp16=False, matching official webui)...")
t0 = time.time()
# Imported here, not at the top of the file, because the package only becomes
# importable after the sys.path entry above has been added. The import time is
# deliberately included in the reported load time, as before.
from indextts.infer_indextts2 import IndexTTS2

tts = IndexTTS2(
    cfg_path=CFG_PATH,
    model_dir=MODEL_DIR,
    is_fp16=False,
    use_cuda_kernel=False,
    use_deepspeed=False,
)
print(f"Model loaded in {time.time() - t0:.1f}s\n")
|
|
|
|
|
|
def run(name, **kwargs):
    """Synthesize one test clip and report how long it took and its file size.

    The shared reference recording ``REF_AUDIO`` is always used as the speaker
    prompt, and the result is written to ``<OUT_DIR>/<name>.wav``.

    Args:
        name: Case label; printed in the log header and used as the output
            file stem.
        **kwargs: Passed through unchanged to ``tts.infer`` (the callers in
            this script supply ``text`` and optionally ``emo_vector`` /
            ``emo_alpha``).
    """
    wav_path = os.path.join(OUT_DIR, "{}.wav".format(name))
    print(f"--- [{name}] ---")

    started = time.time()
    tts.infer(spk_audio_prompt=REF_AUDIO, output_path=wav_path, verbose=True, **kwargs)

    # Size is read before the elapsed time is taken, matching the original
    # evaluation order.
    n_bytes = os.path.getsize(wav_path)
    elapsed = time.time() - started
    print(f"Done in {elapsed:.1f}s, size={n_bytes} bytes -> {wav_path}\n")
|
|
|
|
|
|
# Sample sentence synthesized in every test case of this script; in English:
# "The weather is really nice today, the sun is shining, and everything feels
# wonderful."
TEXT = "今天天气真不错,阳光明媚,感觉一切都很美好。"
|
|
|
|
# Keyword-mapped emo_vector (bypasses broken QwenEmotion).
# Uses the _emo_text_to_vector() logic from IndexTTS2Backend.

# One keyword group per slot of the 8-element emotion vector. Judging by the
# English keyword in each group the slots are: happy, angry, sad, fear, hate,
# low, surprise, neutral (presumably the order IndexTTS2 expects for
# emo_vector -- confirm against the backend).
_EMO_KEYWORDS = (
    ('喜', '开心', '快乐', '高兴', '欢乐', '愉快', 'happy', '热情', '兴奋', '愉悦', '激动'),
    ('怒', '愤怒', '生气', '恼', 'angry', '气愤', '愤慨'),
    ('哀', '悲伤', '难过', '忧郁', '伤心', '悲', 'sad', '感慨', '沉重', '沉痛', '哭'),
    ('惧', '恐惧', '害怕', '恐', 'fear', '担心', '紧张'),
    ('厌恶', '厌', 'hate', '讨厌', '反感'),
    ('低落', '沮丧', '消沉', 'low', '抑郁', '颓废'),
    ('惊喜', '惊讶', '意外', 'surprise', '惊', '吃惊', '震惊'),
    ('自然', '平静', '中性', '平和', 'neutral', '平淡', '冷静', '稳定'),
)


def emo_keywords_to_vector(emo_text):
    """Map a free-text emotion description to an 8-slot emotion vector.

    The text is lower-cased and each keyword group is checked by substring
    match; a group counts at most once no matter how many of its keywords
    appear.

    Args:
        emo_text: Emotion description (Chinese and/or English keywords).
            ``None`` or an empty string is treated as "no emotion".

    Returns:
        ``None`` when nothing matches; otherwise a list of 8 floats where
        every matched slot holds 0.8 (exactly one group matched) or 0.5
        (several groups matched) and all other slots are 0.0.

    Note:
        Matching is by substring, so a keyword that contains another group's
        keyword triggers both groups (e.g. '惊喜' also matches '喜').
    """
    # Guard against None/empty input instead of failing on .lower().
    if not emo_text:
        return None

    text = emo_text.lower()
    matched = [
        idx
        for idx, words in enumerate(_EMO_KEYWORDS)
        if any(word in text for word in words)
    ]
    if not matched:
        return None

    # A single emotion gets a strong weight; mixed emotions share a lower one.
    score = 0.8 if len(matched) == 1 else 0.5
    vec = [0.0] * len(_EMO_KEYWORDS)
    for idx in matched:
        vec[idx] = score
    return vec
|
|
|
|
# Baseline clip: plain synthesis with no emotion controls.
run("v3_00_no_emotion", text=TEXT)

# Emotion cases: (output name, text to speak, emotion description). Each
# description is converted to an emo_vector through the keyword mapping.
cases = [
    ("v3_01_happy", TEXT, "开心愉悦"),
    ("v3_02_sad", TEXT, "悲伤难过"),
    ("v3_03_angry", TEXT, "愤怒生气"),
    ("v3_04_low", TEXT, "低落沮丧"),
    ("v3_05_surprise", TEXT, "惊讶意外"),
    ("v3_06_calm", TEXT, "平静自然"),
]

for case_name, case_text, emo_desc in cases:
    emo_vec = emo_keywords_to_vector(emo_desc)
    print(f" emo_text={emo_desc!r} → emo_vector={emo_vec}")
    run(case_name, text=case_text, emo_vector=emo_vec, emo_alpha=1.0)

# Summary: list everything currently in the output directory with its size.
print("All tests complete. Files saved to:", OUT_DIR)
print("Files:")
for entry in sorted(os.listdir(OUT_DIR)):
    entry_size = os.path.getsize(os.path.join(OUT_DIR, entry))
    print(f" {entry} ({entry_size} bytes)")
|