refactor: rename canto-backend → backend, canto-frontend → frontend

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 18:11:00 +08:00
parent 2fa9c1fcb6
commit 60489eab59
327 changed files with 0 additions and 0 deletions
--- a/backend/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py
+++ b/backend/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import random
+
+import numpy as np
+
+import torchaudio
+import librosa
+from torch.nn import functional as F
+
+from torch.nn.utils.rnn import pad_sequence
+from utils.data_utils import *
+from models.codec.codec_dataset import CodecDataset
+
+
+class FAcodecDataset(torch.utils.data.Dataset):
+    def __init__(self, cfg, dataset, is_valid=False):
+        """
+        Args:
+            cfg: config
+            dataset: dataset name
+            is_valid: whether to use train or valid dataset
+        """
+        self.data_root_dir = cfg.dataset
+        self.data_list = []
+        # walk through the dataset directory recursively, save all files ends with .wav/.mp3/.opus/.flac/.m4a
+        for root, _, files in os.walk(self.data_root_dir):
+            for file in files:
+                if file.endswith((".wav", ".mp3", ".opus", ".flac", ".m4a")):
+                    self.data_list.append(os.path.join(root, file))
+        self.sr = cfg.preprocess_params.sr
+        self.duration_range = cfg.preprocess_params.duration_range
+        self.to_mel = torchaudio.transforms.MelSpectrogram(
+            n_mels=cfg.preprocess_params.spect_params.n_mels,
+            n_fft=cfg.preprocess_params.spect_params.n_fft,
+            win_length=cfg.preprocess_params.spect_params.win_length,
+            hop_length=cfg.preprocess_params.spect_params.hop_length,
+        )
+        self.mean, self.std = -4, 4
+
+    def preprocess(self, wave):
+        wave_tensor = (
+            torch.from_numpy(wave).float() if isinstance(wave, np.ndarray) else wave
+        )
+        mel_tensor = self.to_mel(wave_tensor)
+        mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - self.mean) / self.std
+        return mel_tensor
+
+    def __len__(self):
+        # return len(self.data_list)
+        return len(self.data_list)  # return a fixed number for testing
+
+    def __getitem__(self, index):
+        wave, _ = librosa.load(self.data_list[index], sr=self.sr)
+        wave = np.random.randn(self.sr * random.randint(*self.duration_range))
+        wave = wave / np.max(np.abs(wave))
+        mel = self.preprocess(wave).squeeze(0)
+        wave = torch.from_numpy(wave).float()
+        return wave, mel
+
+
+class FAcodecCollator(object):
+    """Zero-pads model inputs and targets based on number of frames per step"""
+
+    def __init__(self, cfg):
+        self.cfg = cfg
+
+    def __call__(self, batch):
+        # batch[0] = wave, mel, text, f0, speakerid
+        batch_size = len(batch)
+
+        # sort by mel length
+        lengths = [b[1].shape[1] for b in batch]
+        batch_indexes = np.argsort(lengths)[::-1]
+        batch = [batch[bid] for bid in batch_indexes]
+
+        nmels = batch[0][1].size(0)
+        max_mel_length = max([b[1].shape[1] for b in batch])
+        max_wave_length = max([b[0].size(0) for b in batch])
+
+        mels = torch.zeros((batch_size, nmels, max_mel_length)).float() - 10
+        waves = torch.zeros((batch_size, max_wave_length)).float()
+
+        mel_lengths = torch.zeros(batch_size).long()
+        wave_lengths = torch.zeros(batch_size).long()
+
+        for bid, (wave, mel) in enumerate(batch):
+            mel_size = mel.size(1)
+            mels[bid, :, :mel_size] = mel
+            waves[bid, : wave.size(0)] = wave
+            mel_lengths[bid] = mel_size
+            wave_lengths[bid] = wave.size(0)
+
+        return waves, mels, wave_lengths, mel_lengths