init commit

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-26 15:34:31 +08:00
commit 80513a3258
141 changed files with 24966 additions and 0 deletions

View File

View File

@@ -0,0 +1,113 @@
import base64
import io
from pathlib import Path
import numpy as np
import soundfile as sf
from scipy import signal
def validate_ref_audio(audio_data: bytes, max_size_mb: int = 10) -> bool:
    """Check that reference audio is decodable, under the size cap, and 1-30 s long.

    Args:
        audio_data: Raw audio file bytes.
        max_size_mb: Maximum accepted payload size in megabytes.

    Returns:
        True when the clip passes every check; False otherwise, including
        when decoding fails for any reason.
    """
    try:
        # Reject oversized payloads before attempting to decode them.
        if len(audio_data) / (1024 * 1024) > max_size_mb:
            return False
        samples, rate = sf.read(io.BytesIO(audio_data))
        # Reference clips must be between 1 and 30 seconds long.
        return 1.0 <= len(samples) / rate <= 30.0
    except Exception:
        # Undecodable payloads are simply invalid, not an error.
        return False
def process_ref_audio(audio_data: bytes) -> tuple[np.ndarray, int]:
    """Decode reference audio bytes into mono float32 samples at 24 kHz.

    Args:
        audio_data: Raw audio file bytes in any format soundfile can read.

    Returns:
        Tuple of (samples, sample_rate); sample_rate is always 24000.
    """
    samples, source_rate = sf.read(io.BytesIO(audio_data))
    # Downmix multi-channel audio to mono by averaging the channels.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)
    model_rate = 24000
    if source_rate != model_rate:
        samples = resample_audio(samples, source_rate, model_rate)
    return samples.astype(np.float32), model_rate
def resample_audio(audio_array: np.ndarray, orig_sr: int, target_sr: int = 24000) -> np.ndarray:
    """Resample audio to target_sr using scipy's FFT-based resampler.

    Args:
        audio_array: 1-D audio samples.
        orig_sr: Sample rate of the input.
        target_sr: Desired output sample rate (default 24 kHz).

    Returns:
        Resampled samples as float32. When no resampling is needed the input
        is returned unchanged (same object, original dtype).
    """
    if orig_sr == target_sr:
        return audio_array
    # Scale the sample count by the rate ratio, truncating to an integer.
    new_length = int(len(audio_array) * target_sr / orig_sr)
    return signal.resample(audio_array, new_length).astype(np.float32)
def extract_audio_features(audio_array: np.ndarray, sample_rate: int) -> dict:
    """Summarize basic properties of an audio clip.

    Args:
        audio_array: 1-D audio samples.
        sample_rate: Samples per second.

    Returns:
        Dict with duration (seconds), sample_rate, num_samples, and
        rms_energy, all coerced to plain Python scalars.
    """
    sample_count = len(audio_array)
    return {
        'duration': float(sample_count / sample_rate),
        'sample_rate': int(sample_rate),
        'num_samples': int(sample_count),
        # Root-mean-square amplitude over the whole clip.
        'rms_energy': float(np.sqrt(np.mean(audio_array ** 2)))
    }
def encode_audio_to_base64(audio_array: np.ndarray, sample_rate: int) -> str:
    """Serialize audio samples to a base64-encoded WAV string.

    Args:
        audio_array: Audio samples to encode.
        sample_rate: Samples per second written into the WAV header.

    Returns:
        The WAV file contents as a base64 ASCII string.
    """
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_array, sample_rate, format='WAV')
    # getvalue() reads the whole buffer regardless of the cursor position.
    return base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
def decode_base64_to_audio(base64_string: str) -> tuple[np.ndarray, int]:
    """Decode a base64 WAV string back into (samples, sample_rate).

    Args:
        base64_string: Base64-encoded audio file contents.

    Returns:
        Tuple of (samples, sample_rate) as produced by soundfile.
    """
    raw_bytes = base64.b64decode(base64_string)
    samples, rate = sf.read(io.BytesIO(raw_bytes))
    return samples, rate
def validate_audio_format(audio_data: bytes) -> bool:
    """Return True when the bytes decode as an audio format soundfile accepts."""
    try:
        sf.read(io.BytesIO(audio_data))
    except Exception:
        # Any decode failure means the format is unsupported or corrupt.
        return False
    return True
def get_audio_duration(audio_array: np.ndarray, sample_rate: int) -> float:
    """Return the clip length in seconds (sample count / sample rate)."""
    sample_count = len(audio_array)
    return sample_count / sample_rate
def save_audio_file(
    audio_array: np.ndarray,
    sample_rate: int,
    output_path: str | Path
) -> str:
    """Write audio samples to a 16-bit PCM WAV file, creating parent dirs.

    Args:
        audio_array: 1-D (mono) or 2-D samples. A 2-D array that looks like
            (channels, samples) — fewer rows than columns — is transposed to
            the (samples, channels) layout soundfile expects.
        sample_rate: Samples per second.
        output_path: Destination file path.

    Returns:
        The destination path as a string.

    Raises:
        ValueError: If the array has more than two dimensions.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    if not isinstance(audio_array, np.ndarray):
        audio_array = np.array(audio_array, dtype=np.float32)
    if audio_array.ndim == 2:
        # Heuristic channel-major detection; transpose to sample-major.
        if audio_array.shape[0] < audio_array.shape[1]:
            audio_array = audio_array.T
    elif audio_array.ndim != 1:
        raise ValueError(f"Unexpected audio array shape: {audio_array.shape}")
    sf.write(str(destination), audio_array.astype(np.float32), sample_rate, format='WAV', subtype='PCM_16')
    return str(destination)

View File

@@ -0,0 +1,80 @@
import threading
from typing import Dict
from pathlib import Path
from sqlalchemy.orm import Session
from db.models import VoiceCache
class CacheMetrics:
    """Thread-safe in-memory hit/miss counters for the voice cache.

    Keeps global and per-user tallies in process memory; persistent entry
    counts and on-disk size are read from the database and cache directory
    only when a stats snapshot is requested.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self.cache_hits = 0
        self.cache_misses = 0
        # Per-user tallies keyed by user id.
        self._user_hits: Dict[int, int] = {}
        self._user_misses: Dict[int, int] = {}

    def record_hit(self, user_id: int):
        """Count one cache hit, globally and for user_id."""
        with self._lock:
            self.cache_hits += 1
            self._user_hits[user_id] = self._user_hits.get(user_id, 0) + 1

    def record_miss(self, user_id: int):
        """Count one cache miss, globally and for user_id."""
        with self._lock:
            self.cache_misses += 1
            self._user_misses[user_id] = self._user_misses.get(user_id, 0) + 1

    def get_stats(self, db: Session, cache_dir: str) -> dict:
        """Build a statistics snapshot combining counters, DB rows, and disk usage.

        Args:
            db: Active database session used to count persisted cache rows.
            cache_dir: Directory holding the on-disk .pkl cache files.

        Returns:
            Dict with a 'global' section (request totals, hit rate, entry
            count, total size in MB) and a 'users' list of per-user stats.
        """
        with self._lock:
            requests = self.cache_hits + self.cache_misses
            overall_rate = self.cache_hits / requests if requests > 0 else 0.0
            entry_count = db.query(VoiceCache).count()
            # Sum the sizes of every pickle file currently on disk.
            root = Path(cache_dir)
            size_bytes = 0
            if root.exists():
                size_bytes = sum(f.stat().st_size for f in root.glob("*.pkl"))
            per_user = []
            # Cover every user seen on either the hit or the miss side.
            for uid in self._user_hits.keys() | self._user_misses.keys():
                hit_count = self._user_hits.get(uid, 0)
                miss_count = self._user_misses.get(uid, 0)
                seen = hit_count + miss_count
                per_user.append({
                    'user_id': uid,
                    'hits': hit_count,
                    'misses': miss_count,
                    'hit_rate': hit_count / seen if seen > 0 else 0.0,
                    'cache_entries': db.query(VoiceCache).filter(
                        VoiceCache.user_id == uid
                    ).count()
                })
            return {
                'global': {
                    'total_requests': requests,
                    'cache_hits': self.cache_hits,
                    'cache_misses': self.cache_misses,
                    'hit_rate': overall_rate,
                    'total_entries': entry_count,
                    'total_size_mb': size_bytes / (1024 * 1024)
                },
                'users': per_user
            }

    def reset(self):
        """Zero every global and per-user counter."""
        with self._lock:
            self.cache_hits = 0
            self.cache_misses = 0
            self._user_hits.clear()
            self._user_misses.clear()


# Module-level singleton shared across the application.
cache_metrics = CacheMetrics()

View File

@@ -0,0 +1,102 @@
from typing import List, Dict
# Canonical language names accepted by the validators below.
# NOTE(review): "Auto" presumably enables automatic language detection
# downstream — confirm against the synthesis backend.
SUPPORTED_LANGUAGES = [
    "Chinese", "English", "Japanese", "Korean", "German",
    "French", "Russian", "Portuguese", "Spanish", "Italian",
    "Auto", "Cantonese"
]
# Canonical names of the built-in preset speakers.
SUPPORTED_SPEAKERS = [
    "Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric",
    "Ryan", "Aiden", "Ono_Anna", "Sohee"
]
# Human-readable description for each preset speaker, surfaced via
# get_supported_speakers(); missing entries fall back to "".
SPEAKER_DESCRIPTIONS = {
    "Vivian": "Female, professional and clear",
    "Serena": "Female, gentle and warm",
    "Uncle_Fu": "Male, mature and authoritative",
    "Dylan": "Male, young and energetic",
    "Eric": "Male, calm and steady",
    "Ryan": "Male, friendly and casual",
    "Aiden": "Male, deep and resonant",
    "Ono_Anna": "Female, cute and lively",
    "Sohee": "Female, soft and melodious"
}
def validate_language(language: str) -> str:
    """Map a case-insensitive language name to its canonical form.

    Args:
        language: Language name; surrounding whitespace is ignored.

    Returns:
        The matching canonical entry from SUPPORTED_LANGUAGES.

    Raises:
        ValueError: If the language is not supported.
    """
    candidate = language.strip().lower()
    match = next(
        (lang for lang in SUPPORTED_LANGUAGES if lang.lower() == candidate),
        None
    )
    if match is None:
        raise ValueError(
            f"Unsupported language: {language}. "
            f"Supported languages: {', '.join(SUPPORTED_LANGUAGES)}"
        )
    return match
def validate_speaker(speaker: str) -> str:
    """Map a case-insensitive speaker name to its canonical form.

    Args:
        speaker: Speaker name; surrounding whitespace is ignored.

    Returns:
        The matching canonical entry from SUPPORTED_SPEAKERS.

    Raises:
        ValueError: If the speaker is not supported.
    """
    candidate = speaker.strip().lower()
    match = next(
        (name for name in SUPPORTED_SPEAKERS if name.lower() == candidate),
        None
    )
    if match is None:
        raise ValueError(
            f"Unsupported speaker: {speaker}. "
            f"Supported speakers: {', '.join(SUPPORTED_SPEAKERS)}"
        )
    return match
def validate_text_length(text: str, max_length: int = 1000) -> str:
    """Strip and return text, rejecting empty or over-long input.

    Args:
        text: Text to validate.
        max_length: Maximum allowed length, measured on the raw
            (unstripped) text.

    Returns:
        The text with surrounding whitespace removed.

    Raises:
        ValueError: If text is empty/whitespace-only or exceeds max_length.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        raise ValueError("Text cannot be empty")
    if len(text) > max_length:
        raise ValueError(
            f"Text length ({len(text)}) exceeds maximum ({max_length})"
        )
    return stripped
def validate_generation_params(params: dict) -> dict:
    """Fill in defaults and range-check text-generation sampling parameters.

    Args:
        params: Possibly-partial mapping of generation parameters; missing
            keys take their defaults.

    Returns:
        A new dict containing all five parameters, validated.

    Raises:
        ValueError: If any supplied value falls outside its allowed range.
    """
    # (name, default, lower bound, upper bound) — validated in this order.
    bounds = (
        ('max_new_tokens', 2048, 128, 4096),
        ('temperature', 0.9, 0.1, 2.0),
        ('top_k', 50, 1, 100),
        ('top_p', 1.0, 0.0, 1.0),
        ('repetition_penalty', 1.05, 1.0, 2.0),
    )
    validated = {}
    for name, default, low, high in bounds:
        value = params.get(name, default)
        if not low <= value <= high:
            raise ValueError(f"{name} must be between {low} and {high}")
        validated[name] = value
    return validated
def get_supported_languages() -> List[str]:
    """Return a fresh copy of the supported language names."""
    return list(SUPPORTED_LANGUAGES)
def get_supported_speakers() -> List[dict]:
    """Return each speaker's name paired with its description ('' if none)."""
    speakers = []
    for name in SUPPORTED_SPEAKERS:
        speakers.append({
            "name": name,
            "description": SPEAKER_DESCRIPTIONS.get(name, "")
        })
    return speakers