refactor(tts_service): simplify audio data handling in LocalTTSBackend

This commit is contained in:
2026-03-09 11:53:16 +08:00
parent 9b6691bffe
commit e1dbb79564
2 changed files with 6 additions and 17 deletions

View File

@@ -132,7 +132,6 @@ async def prepare_and_create_voice_design(
x_vector = tts.create_voice_clone_prompt( x_vector = tts.create_voice_clone_prompt(
ref_audio=(ref_audio_array, ref_sr), ref_audio=(ref_audio_array, ref_sr),
ref_text=ref_text, ref_text=ref_text,
x_vector_only_mode=True
) )
cache_manager = await VoiceCacheManager.get_instance() cache_manager = await VoiceCacheManager.get_instance()
@@ -142,7 +141,6 @@ async def prepare_and_create_voice_design(
'duration': features['duration'], 'duration': features['duration'],
'sample_rate': features['sample_rate'], 'sample_rate': features['sample_rate'],
'ref_text': ref_text, 'ref_text': ref_text,
'x_vector_only_mode': True,
'instruct': data.instruct 'instruct': data.instruct
} }
cache_id = await cache_manager.set_cache( cache_id = await cache_manager.set_cache(
@@ -257,7 +255,6 @@ async def prepare_voice_clone_prompt(
x_vector = tts.create_voice_clone_prompt( x_vector = tts.create_voice_clone_prompt(
ref_audio=(ref_audio_array, ref_sr), ref_audio=(ref_audio_array, ref_sr),
ref_text=ref_text, ref_text=ref_text,
x_vector_only_mode=True
) )
cache_manager = await VoiceCacheManager.get_instance() cache_manager = await VoiceCacheManager.get_instance()
@@ -268,7 +265,6 @@ async def prepare_voice_clone_prompt(
'duration': features['duration'], 'duration': features['duration'],
'sample_rate': features['sample_rate'], 'sample_rate': features['sample_rate'],
'ref_text': ref_text, 'ref_text': ref_text,
'x_vector_only_mode': True,
'voice_design_id': design_id, 'voice_design_id': design_id,
'instruct': design.instruct 'instruct': design.instruct
} }

View File

@@ -52,15 +52,9 @@ class LocalTTSBackend(TTSBackend):
) )
import numpy as np import numpy as np
if isinstance(result, tuple): wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000)
audio_data = result[0] audio_data = wavs[0] if isinstance(wavs, list) else wavs
else: return self._numpy_to_bytes(audio_data), sample_rate
audio_data = result
if isinstance(audio_data, list):
audio_data = np.array(audio_data)
return self._numpy_to_bytes(audio_data), 24000
async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]: async def generate_voice_design(self, params: dict) -> Tuple[bytes, int]:
await self.model_manager.load_model("voice-design") await self.model_manager.load_model("voice-design")
@@ -78,10 +72,9 @@ class LocalTTSBackend(TTSBackend):
) )
import numpy as np import numpy as np
audio_data = result[0] if isinstance(result, tuple) else result wavs, sample_rate = result if isinstance(result, tuple) else (result, 24000)
if isinstance(audio_data, list): audio_data = wavs[0] if isinstance(wavs, list) else wavs
audio_data = np.array(audio_data) return self._numpy_to_bytes(audio_data), sample_rate
return self._numpy_to_bytes(audio_data), 24000
async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes = None, x_vector=None) -> Tuple[bytes, int]: async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes = None, x_vector=None) -> Tuple[bytes, int]:
from utils.audio import process_ref_audio from utils.audio import process_ref_audio