feat: Add voice design support for voice cloning and enhance cache management

This commit is contained in:
2026-02-04 17:52:24 +08:00
parent 13820e38c7
commit 9e5d12c9fb
5 changed files with 247 additions and 27 deletions

View File

@@ -72,6 +72,36 @@ class VoiceCacheManager:
logger.error(f"Cache retrieval error: {e}", exc_info=True)
return None
async def get_cache_by_id(self, cache_id: int, db: Session) -> Optional[Dict[str, Any]]:
    """Load a cached voice entry by its primary key.

    Looks up the ``VoiceCache`` row whose ``id`` equals ``cache_id``,
    unpickles the file referenced by its ``cache_path``, and records the
    access (``last_accessed`` / ``access_count``) on the row.

    Args:
        cache_id: Primary key of the ``VoiceCache`` row to load.
        db: Session used for the lookup and for committing the access
            bookkeeping.

    Returns:
        A dict with the keys ``'cache_id'``, ``'data'`` (the unpickled
        payload) and ``'metadata'`` (the row's ``meta_data``), or ``None``
        when the row does not exist, the cache file is missing, or any
        error occurs while loading (errors are logged, not raised).
    """
    try:
        cache_entry = db.query(VoiceCache).filter(VoiceCache.id == cache_id).first()
        if not cache_entry:
            logger.debug(f"Cache not found: id={cache_id}")
            return None

        cache_file = Path(cache_entry.cache_path)
        if not cache_file.exists():
            logger.warning(f"Cache file missing: {cache_file}")
            return None

        # NOTE(review): pickle.load can execute arbitrary code when the file
        # content is not trusted -- confirm that cache files are only ever
        # written by this service.
        with open(cache_file, 'rb') as f:
            cache_data = pickle.load(f)

        cache_entry.last_accessed = datetime.utcnow()
        cache_entry.access_count += 1
        try:
            db.commit()
        except Exception:
            # The exception is swallowed by the outer handler, so the caller
            # never learns the commit failed. Roll back here so the session
            # is usable again instead of being left in a failed transaction.
            db.rollback()
            raise

        logger.info(f"Cache loaded by id: cache_id={cache_id}, access_count={cache_entry.access_count}")
        return {
            'cache_id': cache_entry.id,
            'data': cache_data,
            'metadata': cache_entry.meta_data
        }
    except Exception as e:
        logger.error(f"Cache retrieval by id error: {e}", exc_info=True)
        return None
async def set_cache(
self,
user_id: int,

View File

@@ -83,19 +83,23 @@ class LocalTTSBackend(TTSBackend):
audio_data = np.array(audio_data)
return self._numpy_to_bytes(audio_data), 24000
async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes) -> Tuple[bytes, int]:
async def generate_voice_clone(self, params: dict, ref_audio_bytes: bytes = None, x_vector=None) -> Tuple[bytes, int]:
from utils.audio import process_ref_audio
ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes)
await self.model_manager.load_model("base")
_, tts = await self.model_manager.get_current_model()
x_vector = tts.create_voice_clone_prompt(
ref_audio=(ref_audio_array, ref_sr),
ref_text=params.get('ref_text', ''),
x_vector_only_mode=False
)
if x_vector is None:
if ref_audio_bytes is None:
raise ValueError("Either ref_audio_bytes or x_vector must be provided")
ref_audio_array, ref_sr = process_ref_audio(ref_audio_bytes)
x_vector = tts.create_voice_clone_prompt(
ref_audio=(ref_audio_array, ref_sr),
ref_text=params.get('ref_text', ''),
x_vector_only_mode=False
)
wavs, sample_rate = tts.generate_voice_clone(
text=params['text'],