# Canto/test.py

"""
IndexTTS2 standalone test - following official webui best practices.

Key differences from our current backend:
- is_fp16=False (official webui default, we were using is_fp16=True)
- Tests all emotion modes: none, audio-ref, emo_text, emo_vector
"""
import os
import sys
import time

# All data paths are built from the directory that contains this script.
BASE_DIR = os.path.dirname(__file__)
MODEL_DIR = os.path.join(BASE_DIR, "qwen3-tts-backend", "Qwen", "IndexTTS2")
CFG_PATH = os.path.join(MODEL_DIR, "config.yaml")
REF_AUDIO = os.path.join(BASE_DIR, "test_indextts2_outputs", "ref_audio_id242_test54321.wav")
OUT_DIR = os.path.join(BASE_DIR, "test_indextts2_v2")

# Directory put at the front of sys.path so that `indextts` can be imported.
# Set the INDEXTTS2_REPO environment variable to point at a different checkout;
# when it is unset the original hard-coded location is used.
INDEXTTS2_REPO = os.environ.get("INDEXTTS2_REPO", "/home/bdim/Documents/github/indexTTS2")

os.makedirs(OUT_DIR, exist_ok=True)

sys.path.insert(0, INDEXTTS2_REPO)

print(f"Model dir: {MODEL_DIR}")
print(f"Config: {CFG_PATH}")
print(f"Ref audio: {REF_AUDIO}")
print(f"Output dir: {OUT_DIR}")
print()

# Fail fast: both files are required further down, so report a missing one now
# instead of after the (slow) model load.
for required_path in (CFG_PATH, REF_AUDIO):
    if not os.path.isfile(required_path):
        raise FileNotFoundError(f"Required input file not found: {required_path}")

print("Loading IndexTTS2 model (is_fp16=False, matching official webui)...")
t0 = time.time()
# Imported here (not at the top of the file) because the package only becomes
# importable after the sys.path tweak above; the import time is deliberately
# included in the reported load time.
from indextts.infer_indextts2 import IndexTTS2

tts = IndexTTS2(
    cfg_path=CFG_PATH,
    model_dir=MODEL_DIR,
    is_fp16=False,
    use_cuda_kernel=False,
    use_deepspeed=False,
)
print(f"Model loaded in {time.time() - t0:.1f}s\n")


def run(name, **kwargs):
    """Synthesize one test case and report how long it took.

    The audio is written to ``<OUT_DIR>/<name>.wav`` using ``REF_AUDIO`` as the
    speaker prompt. Any extra keyword arguments (for example ``text``,
    ``emo_vector`` or ``emo_alpha``) are forwarded unchanged to ``tts.infer``.

    Args:
        name: Case label; used for the banner line and the output file name.
        **kwargs: Additional arguments passed straight through to ``tts.infer``.
    """
    wav_path = os.path.join(OUT_DIR, f"{name}.wav")
    print(f"--- [{name}] ---")

    started = time.time()
    tts.infer(spk_audio_prompt=REF_AUDIO, output_path=wav_path, verbose=True, **kwargs)

    # Size is read back from disk, so this raises if no file was produced.
    sz = os.path.getsize(wav_path)
    elapsed = time.time() - started
    print(f"Done in {elapsed:.1f}s, size={sz} bytes -> {wav_path}\n")


# Sentence synthesized in every test case. English gloss: "The weather is
# really nice today, the sun is shining, and everything feels wonderful."
TEXT = "今天天气真不错，阳光明媚，感觉一切都很美好。"

# Keyword-based emo_vector construction (sidesteps the broken QwenEmotion path).
# Follows the _emo_text_to_vector() logic from IndexTTS2Backend.
def emo_keywords_to_vector(emo_text):
    """Turn a free-text emotion description into an 8-slot emotion vector.

    The lowercased text is scanned for keywords by plain substring search.
    Each of the eight keyword groups owns one slot of the result; going by the
    English keyword in each group the slots are, in order: happy, angry, sad,
    fear, hate, low, surprise, neutral.

    Args:
        emo_text: Emotion description (Chinese and/or English keywords).

    Returns:
        ``None`` when no keyword is found. Otherwise a list of eight floats in
        which every matched slot holds 0.8 (exactly one group matched) or 0.5
        (several groups matched) and every other slot holds 0.0.
    """
    keyword_groups = (
        ('喜', '开心', '快乐', '高兴', '欢乐', '愉快', 'happy', '热情', '兴奋', '愉悦', '激动'),
        ('怒', '愤怒', '生气', '恼', 'angry', '气愤', '愤慨'),
        ('哀', '悲伤', '难过', '忧郁', '伤心', '悲', 'sad', '感慨', '沉重', '沉痛', '哭'),
        ('惧', '恐惧', '害怕', '恐', 'fear', '担心', '紧张'),
        ('厌恶', '厌', 'hate', '讨厌', '反感'),
        ('低落', '沮丧', '消沉', 'low', '抑郁', '颓废'),
        ('惊喜', '惊讶', '意外', 'surprise', '惊', '吃惊', '震惊'),
        ('自然', '平静', '中性', '平和', 'neutral', '平淡', '冷静', '稳定'),
    )

    lowered = emo_text.lower()
    # A group counts at most once, however many of its keywords appear.
    hit_slots = [
        slot
        for slot, group in enumerate(keyword_groups)
        if any(keyword in lowered for keyword in group)
    ]
    if not hit_slots:
        return None

    # A single emotion gets a stronger weight than each member of a blend.
    weight = 0.5 if len(hit_slots) > 1 else 0.8
    return [weight if slot in hit_slots else 0.0 for slot in range(len(keyword_groups))]

# Reference sample: synthesize the sentence with no emotion arguments at all.
run("v3_00_no_emotion", text=TEXT)

# One case per emotion: (output name, sentence, emotion description).
# The description is converted to an emo_vector by keyword lookup.
cases = [
    ("v3_01_happy", TEXT, "开心愉悦"),
    ("v3_02_sad", TEXT, "悲伤难过"),
    ("v3_03_angry", TEXT, "愤怒生气"),
    ("v3_04_low", TEXT, "低落沮丧"),
    ("v3_05_surprise", TEXT, "惊讶意外"),
    ("v3_06_calm", TEXT, "平静自然"),
]

for case_name, case_text, emo_desc in cases:
    emo_vec = emo_keywords_to_vector(emo_desc)
    print(f"  emo_text={emo_desc!r} → emo_vector={emo_vec}")
    run(case_name, text=case_text, emo_vector=emo_vec, emo_alpha=1.0)

print("All tests complete. Files saved to:", OUT_DIR)
print("Files:")
# Lists everything currently in OUT_DIR, in sorted order, with its size.
entries = os.listdir(OUT_DIR)
entries.sort()
for entry in entries:
    entry_size = os.path.getsize(os.path.join(OUT_DIR, entry))
    print(f"  {entry}  ({entry_size} bytes)")