"""IndexTTS2 standalone test - following official webui best practices.

Key differences from our current backend:
- is_fp16=False (official webui default, we were using is_fp16=True)

Emotion modes exercised by this script:
- none (baseline run without any emotion arguments)
- emo_vector (vectors derived from emotion keywords, see
  emo_keywords_to_vector())

The audio-ref and emo_text modes are not exercised here.

Run as a script to load the model and synthesize every case; importing the
module only defines constants and helpers (no model load, no file I/O).
"""
import os
import sys
import time

_HERE = os.path.dirname(__file__)

MODEL_DIR = os.path.join(_HERE, "qwen3-tts-backend", "Qwen", "IndexTTS2")
CFG_PATH = os.path.join(MODEL_DIR, "config.yaml")
REF_AUDIO = os.path.join(_HERE, "test_indextts2_outputs", "ref_audio_id242_test54321.wav")
OUT_DIR = os.path.join(_HERE, "test_indextts2_v2")

# Directory put at the front of sys.path so that `indextts` can be imported.
# Overridable through the INDEXTTS2_REPO environment variable; the default is
# the path this script has always used.
INDEXTTS2_REPO = os.environ.get("INDEXTTS2_REPO", "/home/bdim/Documents/github/indexTTS2")

TEXT = "今天天气真不错,阳光明媚,感觉一切都很美好。"

# One keyword group per emo_vector slot; the slot index is the group's position
# in this list.
EMO_KEYWORDS = [
    ['喜', '开心', '快乐', '高兴', '欢乐', '愉快', 'happy', '热情', '兴奋', '愉悦', '激动'],
    ['怒', '愤怒', '生气', '恼', 'angry', '气愤', '愤慨'],
    ['哀', '悲伤', '难过', '忧郁', '伤心', '悲', 'sad', '感慨', '沉重', '沉痛', '哭'],
    ['惧', '恐惧', '害怕', '恐', 'fear', '担心', '紧张'],
    ['厌恶', '厌', 'hate', '讨厌', '反感'],
    ['低落', '沮丧', '消沉', 'low', '抑郁', '颓废'],
    ['惊喜', '惊讶', '意外', 'surprise', '惊', '吃惊', '震惊'],
    ['自然', '平静', '中性', '平和', 'neutral', '平淡', '冷静', '稳定'],
]

# Model instance shared by run(); populated by main().
tts = None


def load_model():
    """Import IndexTTS2 from INDEXTTS2_REPO and return a constructed model.

    The import is done here (not at module top) because `indextts` only
    becomes importable after INDEXTTS2_REPO has been added to sys.path.
    """
    sys.path.insert(0, INDEXTTS2_REPO)
    from indextts.infer_indextts2 import IndexTTS2

    return IndexTTS2(
        cfg_path=CFG_PATH,
        model_dir=MODEL_DIR,
        is_fp16=False,
        use_cuda_kernel=False,
        use_deepspeed=False,
    )


def run(name, **kwargs):
    """Synthesize one test case into OUT_DIR/<name>.wav and report time/size.

    Args:
        name: Case name; used as the output file stem and in the log lines.
        **kwargs: Forwarded unchanged to ``tts.infer`` (e.g. ``text``,
            ``emo_vector``, ``emo_alpha``).

    Raises:
        RuntimeError: If the module-level ``tts`` model has not been loaded.
    """
    if tts is None:
        raise RuntimeError("IndexTTS2 model is not loaded; call main() or set `tts` first")
    out = os.path.join(OUT_DIR, f"{name}.wav")
    print(f"--- [{name}] ---")
    started = time.time()
    tts.infer(
        spk_audio_prompt=REF_AUDIO,
        output_path=out,
        verbose=True,
        **kwargs,
    )
    sz = os.path.getsize(out)
    print(f"Done in {time.time()-started:.1f}s, size={sz} bytes -> {out}\n")


# Keyword-mapped emo_vector (bypasses broken QwenEmotion)
# Uses _emo_text_to_vector() logic from IndexTTS2Backend
def emo_keywords_to_vector(emo_text):
    """Map a free-form emotion description to an emotion vector.

    The text is lower-cased and every group in EMO_KEYWORDS is checked for a
    substring hit. Each matched group's slot is set to 0.8 when exactly one
    group matched, or 0.5 when several did; all other slots stay 0.0.

    Args:
        emo_text: Emotion description (Chinese and/or English keywords).

    Returns:
        A list of ``len(EMO_KEYWORDS)`` floats, or ``None`` when ``emo_text``
        is empty/None or contains no known keyword.
    """
    if not emo_text:
        return None
    text = emo_text.lower()
    # NOTE(review): matching is by substring, so short keywords also hit inside
    # longer words (e.g. '喜' inside '惊喜'). Kept as-is to stay in line with
    # the backend logic this function mirrors.
    matched = [
        idx
        for idx, words in enumerate(EMO_KEYWORDS)
        if any(word in text for word in words)
    ]
    if not matched:
        return None
    score = 0.8 if len(matched) == 1 else 0.5
    vec = [0.0] * len(EMO_KEYWORDS)
    for idx in matched:
        vec[idx] = score
    return vec


def main():
    """Load the model, synthesize the baseline and all emotion cases, list outputs."""
    global tts

    os.makedirs(OUT_DIR, exist_ok=True)

    print(f"Model dir: {MODEL_DIR}")
    print(f"Config: {CFG_PATH}")
    print(f"Ref audio: {REF_AUDIO}")
    print(f"Output dir: {OUT_DIR}")
    print()

    print("Loading IndexTTS2 model (is_fp16=False, matching official webui)...")
    t0 = time.time()
    tts = load_model()
    print(f"Model loaded in {time.time() - t0:.1f}s\n")

    # Baseline: no emotion
    run("v3_00_no_emotion", text=TEXT)

    # Test each emotion via keyword → vector mapping
    cases = [
        ("v3_01_happy", TEXT, "开心愉悦"),
        ("v3_02_sad", TEXT, "悲伤难过"),
        ("v3_03_angry", TEXT, "愤怒生气"),
        ("v3_04_low", TEXT, "低落沮丧"),
        ("v3_05_surprise", TEXT, "惊讶意外"),
        ("v3_06_calm", TEXT, "平静自然"),
    ]
    for name, text, emo in cases:
        vec = emo_keywords_to_vector(emo)
        print(f" emo_text={repr(emo)} → emo_vector={vec}")
        run(name, text=text, emo_vector=vec, emo_alpha=1.0)

    print("All tests complete. Files saved to:", OUT_DIR)
    print("Files:")
    for fname in sorted(os.listdir(OUT_DIR)):
        path = os.path.join(OUT_DIR, fname)
        print(f" {fname} ({os.path.getsize(path)} bytes)")


if __name__ == "__main__":
    main()