init commit

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-26 15:34:31 +08:00
commit 80513a3258
141 changed files with 24966 additions and 0 deletions

View File

View File

@@ -0,0 +1,113 @@
import base64
import io
from pathlib import Path
import numpy as np
import soundfile as sf
from scipy import signal
def validate_ref_audio(audio_data: bytes, max_size_mb: int = 10) -> bool:
    """Check that reference audio is decodable, under the size cap, and 1-30 s long.

    Args:
        audio_data: Raw audio file bytes.
        max_size_mb: Maximum accepted payload size in megabytes.

    Returns:
        True when the clip passes every check; False otherwise, including
        when decoding fails for any reason.
    """
    try:
        # Reject oversized payloads before attempting to decode them.
        if len(audio_data) / (1024 * 1024) > max_size_mb:
            return False
        samples, rate = sf.read(io.BytesIO(audio_data))
        # Reference clips must be between 1 and 30 seconds long.
        return 1.0 <= len(samples) / rate <= 30.0
    except Exception:
        # Undecodable payloads are simply invalid, not an error.
        return False
def process_ref_audio(audio_data: bytes) -> tuple[np.ndarray, int]:
    """Decode reference audio bytes into mono float32 samples at 24 kHz.

    Args:
        audio_data: Raw audio file bytes in any format soundfile can read.

    Returns:
        Tuple of (samples, sample_rate); sample_rate is always 24000.
    """
    samples, source_rate = sf.read(io.BytesIO(audio_data))
    # Downmix multi-channel audio to mono by averaging the channels.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)
    model_rate = 24000
    if source_rate != model_rate:
        samples = resample_audio(samples, source_rate, model_rate)
    return samples.astype(np.float32), model_rate
def resample_audio(audio_array: np.ndarray, orig_sr: int, target_sr: int = 24000) -> np.ndarray:
    """Resample audio to target_sr using scipy's FFT-based resampler.

    Args:
        audio_array: 1-D audio samples.
        orig_sr: Sample rate of the input.
        target_sr: Desired output sample rate (default 24 kHz).

    Returns:
        Resampled samples as float32. When no resampling is needed the input
        is returned unchanged (same object, original dtype).
    """
    if orig_sr == target_sr:
        return audio_array
    # Scale the sample count by the rate ratio, truncating to an integer.
    new_length = int(len(audio_array) * target_sr / orig_sr)
    return signal.resample(audio_array, new_length).astype(np.float32)
def extract_audio_features(audio_array: np.ndarray, sample_rate: int) -> dict:
    """Summarize basic properties of an audio clip.

    Args:
        audio_array: 1-D audio samples.
        sample_rate: Samples per second.

    Returns:
        Dict with duration (seconds), sample_rate, num_samples, and
        rms_energy, all coerced to plain Python scalars.
    """
    sample_count = len(audio_array)
    return {
        'duration': float(sample_count / sample_rate),
        'sample_rate': int(sample_rate),
        'num_samples': int(sample_count),
        # Root-mean-square amplitude over the whole clip.
        'rms_energy': float(np.sqrt(np.mean(audio_array ** 2)))
    }
def encode_audio_to_base64(audio_array: np.ndarray, sample_rate: int) -> str:
    """Serialize audio samples to a base64-encoded WAV string.

    Args:
        audio_array: Audio samples to encode.
        sample_rate: Samples per second written into the WAV header.

    Returns:
        The WAV file contents as a base64 ASCII string.
    """
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_array, sample_rate, format='WAV')
    # getvalue() reads the whole buffer regardless of the cursor position.
    return base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
def decode_base64_to_audio(base64_string: str) -> tuple[np.ndarray, int]:
    """Decode a base64 WAV string back into (samples, sample_rate).

    Args:
        base64_string: Base64-encoded audio file contents.

    Returns:
        Tuple of (samples, sample_rate) as produced by soundfile.
    """
    raw_bytes = base64.b64decode(base64_string)
    samples, rate = sf.read(io.BytesIO(raw_bytes))
    return samples, rate
def validate_audio_format(audio_data: bytes) -> bool:
    """Return True when the bytes decode as an audio format soundfile accepts."""
    try:
        sf.read(io.BytesIO(audio_data))
    except Exception:
        # Any decode failure means the format is unsupported or corrupt.
        return False
    return True
def get_audio_duration(audio_array: np.ndarray, sample_rate: int) -> float:
    """Return the clip length in seconds (sample count / sample rate)."""
    sample_count = len(audio_array)
    return sample_count / sample_rate
def save_audio_file(
    audio_array: np.ndarray,
    sample_rate: int,
    output_path: str | Path
) -> str:
    """Write audio samples to a 16-bit PCM WAV file, creating parent dirs.

    Args:
        audio_array: 1-D (mono) or 2-D samples. A 2-D array that looks like
            (channels, samples) — fewer rows than columns — is transposed to
            the (samples, channels) layout soundfile expects.
        sample_rate: Samples per second.
        output_path: Destination file path.

    Returns:
        The destination path as a string.

    Raises:
        ValueError: If the array has more than two dimensions.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    if not isinstance(audio_array, np.ndarray):
        audio_array = np.array(audio_array, dtype=np.float32)
    if audio_array.ndim == 2:
        # Heuristic channel-major detection; transpose to sample-major.
        if audio_array.shape[0] < audio_array.shape[1]:
            audio_array = audio_array.T
    elif audio_array.ndim != 1:
        raise ValueError(f"Unexpected audio array shape: {audio_array.shape}")
    sf.write(str(destination), audio_array.astype(np.float32), sample_rate, format='WAV', subtype='PCM_16')
    return str(destination)

View File

@@ -0,0 +1,80 @@
import threading
from typing import Dict
from pathlib import Path
from sqlalchemy.orm import Session
from db.models import VoiceCache
class CacheMetrics:
    """Thread-safe in-memory hit/miss counters for the voice cache.

    Keeps global and per-user tallies in process memory; persistent entry
    counts and on-disk size are read from the database and cache directory
    only when a stats snapshot is requested.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self.cache_hits = 0
        self.cache_misses = 0
        # Per-user tallies keyed by user id.
        self._user_hits: Dict[int, int] = {}
        self._user_misses: Dict[int, int] = {}

    def record_hit(self, user_id: int):
        """Count one cache hit, globally and for user_id."""
        with self._lock:
            self.cache_hits += 1
            self._user_hits[user_id] = self._user_hits.get(user_id, 0) + 1

    def record_miss(self, user_id: int):
        """Count one cache miss, globally and for user_id."""
        with self._lock:
            self.cache_misses += 1
            self._user_misses[user_id] = self._user_misses.get(user_id, 0) + 1

    def get_stats(self, db: Session, cache_dir: str) -> dict:
        """Build a statistics snapshot combining counters, DB rows, and disk usage.

        Args:
            db: Active database session used to count persisted cache rows.
            cache_dir: Directory holding the on-disk .pkl cache files.

        Returns:
            Dict with a 'global' section (request totals, hit rate, entry
            count, total size in MB) and a 'users' list of per-user stats.
        """
        with self._lock:
            requests = self.cache_hits + self.cache_misses
            overall_rate = self.cache_hits / requests if requests > 0 else 0.0
            entry_count = db.query(VoiceCache).count()
            # Sum the sizes of every pickle file currently on disk.
            root = Path(cache_dir)
            size_bytes = 0
            if root.exists():
                size_bytes = sum(f.stat().st_size for f in root.glob("*.pkl"))
            per_user = []
            # Cover every user seen on either the hit or the miss side.
            for uid in self._user_hits.keys() | self._user_misses.keys():
                hit_count = self._user_hits.get(uid, 0)
                miss_count = self._user_misses.get(uid, 0)
                seen = hit_count + miss_count
                per_user.append({
                    'user_id': uid,
                    'hits': hit_count,
                    'misses': miss_count,
                    'hit_rate': hit_count / seen if seen > 0 else 0.0,
                    'cache_entries': db.query(VoiceCache).filter(
                        VoiceCache.user_id == uid
                    ).count()
                })
            return {
                'global': {
                    'total_requests': requests,
                    'cache_hits': self.cache_hits,
                    'cache_misses': self.cache_misses,
                    'hit_rate': overall_rate,
                    'total_entries': entry_count,
                    'total_size_mb': size_bytes / (1024 * 1024)
                },
                'users': per_user
            }

    def reset(self):
        """Zero every global and per-user counter."""
        with self._lock:
            self.cache_hits = 0
            self.cache_misses = 0
            self._user_hits.clear()
            self._user_misses.clear()


# Module-level singleton shared across the application.
cache_metrics = CacheMetrics()

View File

@@ -0,0 +1,102 @@
from typing import List, Dict
# Canonical language names accepted by the validators below.
# NOTE(review): "Auto" presumably enables automatic language detection
# downstream — confirm against the synthesis backend.
SUPPORTED_LANGUAGES = [
    "Chinese", "English", "Japanese", "Korean", "German",
    "French", "Russian", "Portuguese", "Spanish", "Italian",
    "Auto", "Cantonese"
]
# Canonical names of the built-in preset speakers.
SUPPORTED_SPEAKERS = [
    "Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric",
    "Ryan", "Aiden", "Ono_Anna", "Sohee"
]
# Human-readable description for each preset speaker, surfaced via
# get_supported_speakers(); missing entries fall back to "".
SPEAKER_DESCRIPTIONS = {
    "Vivian": "Female, professional and clear",
    "Serena": "Female, gentle and warm",
    "Uncle_Fu": "Male, mature and authoritative",
    "Dylan": "Male, young and energetic",
    "Eric": "Male, calm and steady",
    "Ryan": "Male, friendly and casual",
    "Aiden": "Male, deep and resonant",
    "Ono_Anna": "Female, cute and lively",
    "Sohee": "Female, soft and melodious"
}
def validate_language(language: str) -> str:
    """Map a case-insensitive language name to its canonical form.

    Args:
        language: Language name; surrounding whitespace is ignored.

    Returns:
        The matching canonical entry from SUPPORTED_LANGUAGES.

    Raises:
        ValueError: If the language is not supported.
    """
    candidate = language.strip().lower()
    match = next(
        (lang for lang in SUPPORTED_LANGUAGES if lang.lower() == candidate),
        None
    )
    if match is None:
        raise ValueError(
            f"Unsupported language: {language}. "
            f"Supported languages: {', '.join(SUPPORTED_LANGUAGES)}"
        )
    return match
def validate_speaker(speaker: str) -> str:
    """Map a case-insensitive speaker name to its canonical form.

    Args:
        speaker: Speaker name; surrounding whitespace is ignored.

    Returns:
        The matching canonical entry from SUPPORTED_SPEAKERS.

    Raises:
        ValueError: If the speaker is not supported.
    """
    candidate = speaker.strip().lower()
    match = next(
        (name for name in SUPPORTED_SPEAKERS if name.lower() == candidate),
        None
    )
    if match is None:
        raise ValueError(
            f"Unsupported speaker: {speaker}. "
            f"Supported speakers: {', '.join(SUPPORTED_SPEAKERS)}"
        )
    return match
def validate_text_length(text: str, max_length: int = 1000) -> str:
    """Strip and return text, rejecting empty or over-long input.

    Args:
        text: Text to validate.
        max_length: Maximum allowed length, measured on the raw
            (unstripped) text.

    Returns:
        The text with surrounding whitespace removed.

    Raises:
        ValueError: If text is empty/whitespace-only or exceeds max_length.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        raise ValueError("Text cannot be empty")
    if len(text) > max_length:
        raise ValueError(
            f"Text length ({len(text)}) exceeds maximum ({max_length})"
        )
    return stripped
def validate_generation_params(params: dict) -> dict:
    """Fill in defaults and range-check text-generation sampling parameters.

    Args:
        params: Possibly-partial mapping of generation parameters; missing
            keys take their defaults.

    Returns:
        A new dict containing all five parameters, validated.

    Raises:
        ValueError: If any supplied value falls outside its allowed range.
    """
    # (name, default, lower bound, upper bound) — validated in this order.
    bounds = (
        ('max_new_tokens', 2048, 128, 4096),
        ('temperature', 0.9, 0.1, 2.0),
        ('top_k', 50, 1, 100),
        ('top_p', 1.0, 0.0, 1.0),
        ('repetition_penalty', 1.05, 1.0, 2.0),
    )
    validated = {}
    for name, default, low, high in bounds:
        value = params.get(name, default)
        if not low <= value <= high:
            raise ValueError(f"{name} must be between {low} and {high}")
        validated[name] = value
    return validated
def get_supported_languages() -> List[str]:
    """Return a fresh copy of the supported language names."""
    return list(SUPPORTED_LANGUAGES)
def get_supported_speakers() -> List[dict]:
    """Return each speaker's name paired with its description ('' if none)."""
    speakers = []
    for name in SUPPORTED_SPEAKERS:
        speakers.append({
            "name": name,
            "description": SPEAKER_DESCRIPTIONS.get(name, "")
        })
    return speakers