From e1dbb79564857e3a06c9ffc28b22b892e42e536f Mon Sep 17 00:00:00 2001 From: bdim404 Date: Mon, 9 Mar 2026 11:53:16 +0800 Subject: [PATCH] refactor(tts_service): simplify audio data handling in LocalTTSBackend --- qwen3-tts-backend/api/voice_designs.py | 4 ---- qwen3-tts-backend/core/tts_service.py | 19 ++++++------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/qwen3-tts-backend/api/voice_designs.py b/qwen3-tts-backend/api/voice_designs.py index 77e2140..5d26453 100644 --- a/qwen3-tts-backend/api/voice_designs.py +++ b/qwen3-tts-backend/api/voice_designs.py @@ -132,7 +132,6 @@ async def prepare_and_create_voice_design( x_vector = tts.create_voice_clone_prompt( ref_audio=(ref_audio_array, ref_sr), ref_text=ref_text, - x_vector_only_mode=True ) cache_manager = await VoiceCacheManager.get_instance() @@ -142,7 +141,6 @@ async def prepare_and_create_voice_design( 'duration': features['duration'], 'sample_rate': features['sample_rate'], 'ref_text': ref_text, - 'x_vector_only_mode': True, 'instruct': data.instruct } cache_id = await cache_manager.set_cache( @@ -257,7 +255,6 @@ async def prepare_voice_clone_prompt( x_vector = tts.create_voice_clone_prompt( ref_audio=(ref_audio_array, ref_sr), ref_text=ref_text, - x_vector_only_mode=True ) cache_manager = await VoiceCacheManager.get_instance() @@ -268,7 +265,6 @@ async def prepare_voice_clone_prompt( 'duration': features['duration'], 'sample_rate': features['sample_rate'], 'ref_text': ref_text, - 'x_vector_only_mode': True, 'voice_design_id': design_id, 'instruct': design.instruct } diff --git a/qwen3-tts-backend/core/tts_service.py b/qwen3-tts-backend/core/tts_service.py index a410858..813adcf 100644 --- a/qwen3-tts-backend/core/tts_service.py +++ b/qwen3-tts-backend/core/tts_service.py @@ -52,15 +52,9 @@ class LocalTTSBackend(TTSBackend): ) import numpy as np - if isinstance(result, tuple): - audio_data = result[0] - else: - audio_data = result - - if isinstance(audio_data, list): - audio_data = np.array(audio_data) - - return self._numpy_to_bytes(audio_data), 24000 + wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000) + audio_data = wavs[0] if isinstance(wavs, list) else wavs + return self._numpy_to_bytes(audio_data), sample_rate async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]: await self.model_manager.load_model("voice-design") @@ -78,10 +72,9 @@ class LocalTTSBackend(TTSBackend): ) import numpy as np - audio_data = result[0] if isinstance(result, tuple) else result - if isinstance(audio_data, list): - audio_data = np.array(audio_data) - return self._numpy_to_bytes(audio_data), 24000 + wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000) + audio_data = wavs[0] if isinstance(wavs, list) else wavs + return self._numpy_to_bytes(audio_data), sample_rate async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes = None, x_vector=None) -> Tuple[bytes, int]: from utils.audio import process_ref_audio