refactor: rename canto-backend → backend, canto-frontend → frontend

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 18:11:00 +08:00
parent 2fa9c1fcb6
commit 60489eab59
327 changed files with 0 additions and 0 deletions
--- a/backend/indextts/s2mel/modules/campplus/layers.py
+++ b/backend/indextts/s2mel/modules/campplus/layers.py
@@ -0,0 +1,253 @@
+# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from torch import nn
+
+
+def get_nonlinear(config_str, channels):
+    nonlinear = nn.Sequential()
+    for name in config_str.split('-'):
+        if name == 'relu':
+            nonlinear.add_module('relu', nn.ReLU(inplace=True))
+        elif name == 'prelu':
+            nonlinear.add_module('prelu', nn.PReLU(channels))
+        elif name == 'batchnorm':
+            nonlinear.add_module('batchnorm', nn.BatchNorm1d(channels))
+        elif name == 'batchnorm_':
+            nonlinear.add_module('batchnorm',
+                                 nn.BatchNorm1d(channels, affine=False))
+        else:
+            raise ValueError('Unexpected module ({}).'.format(name))
+    return nonlinear
+
+def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
+    mean = x.mean(dim=dim)
+    std = x.std(dim=dim, unbiased=unbiased)
+    stats = torch.cat([mean, std], dim=-1)
+    if keepdim:
+        stats = stats.unsqueeze(dim=dim)
+    return stats
+
+
+class StatsPool(nn.Module):
+    def forward(self, x):
+        return statistics_pooling(x)
+
+
+class TDNNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu'):
+        super(TDNNLayer, self).__init__()
+        if padding < 0:
+            assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
+                kernel_size)
+            padding = (kernel_size - 1) // 2 * dilation
+        self.linear = nn.Conv1d(in_channels,
+                                out_channels,
+                                kernel_size,
+                                stride=stride,
+                                padding=padding,
+                                dilation=dilation,
+                                bias=bias)
+        self.nonlinear = get_nonlinear(config_str, out_channels)
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = self.nonlinear(x)
+        return x
+
+
+class CAMLayer(nn.Module):
+    def __init__(self,
+                bn_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                dilation,
+                bias,
+                reduction=2):
+        super(CAMLayer, self).__init__()
+        self.linear_local = nn.Conv1d(bn_channels,
+                                 out_channels,
+                                 kernel_size,
+                                 stride=stride,
+                                 padding=padding,
+                                 dilation=dilation,
+                                 bias=bias)
+        self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
+        self.relu = nn.ReLU(inplace=True)
+        self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        y = self.linear_local(x)
+        context = x.mean(-1, keepdim=True)+self.seg_pooling(x)
+        context = self.relu(self.linear1(context))
+        m = self.sigmoid(self.linear2(context))
+        return y*m
+
+    def seg_pooling(self, x, seg_len=100, stype='avg'):
+        if stype == 'avg':
+            seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
+        elif stype == 'max':
+            seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
+        else:
+            raise ValueError('Wrong segment pooling type.')
+        shape = seg.shape
+        seg = seg.unsqueeze(-1).expand(*shape, seg_len).reshape(*shape[:-1], -1)
+        seg = seg[..., :x.shape[-1]]
+        return seg
+
+
+class CAMDenseTDNNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bn_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu',
+                 memory_efficient=False):
+        super(CAMDenseTDNNLayer, self).__init__()
+        assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
+            kernel_size)
+        padding = (kernel_size - 1) // 2 * dilation
+        self.memory_efficient = memory_efficient
+        self.nonlinear1 = get_nonlinear(config_str, in_channels)
+        self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
+        self.nonlinear2 = get_nonlinear(config_str, bn_channels)
+        self.cam_layer = CAMLayer(bn_channels,
+                                out_channels,
+                                kernel_size,
+                                stride=stride,
+                                padding=padding,
+                                dilation=dilation,
+                                bias=bias)
+
+    def bn_function(self, x):
+        return self.linear1(self.nonlinear1(x))
+
+    def forward(self, x):
+        if self.training and self.memory_efficient:
+            x = cp.checkpoint(self.bn_function, x)
+        else:
+            x = self.bn_function(x)
+        x = self.cam_layer(self.nonlinear2(x))
+        return x
+
+
+class CAMDenseTDNNBlock(nn.ModuleList):
+    def __init__(self,
+                 num_layers,
+                 in_channels,
+                 out_channels,
+                 bn_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu',
+                 memory_efficient=False):
+        super(CAMDenseTDNNBlock, self).__init__()
+        for i in range(num_layers):
+            layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels,
+                                   out_channels=out_channels,
+                                   bn_channels=bn_channels,
+                                   kernel_size=kernel_size,
+                                   stride=stride,
+                                   dilation=dilation,
+                                   bias=bias,
+                                   config_str=config_str,
+                                   memory_efficient=memory_efficient)
+            self.add_module('tdnnd%d' % (i + 1), layer)
+
+    def forward(self, x):
+        for layer in self:
+            x = torch.cat([x, layer(x)], dim=1)
+        return x
+
+
+class TransitLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bias=True,
+                 config_str='batchnorm-relu'):
+        super(TransitLayer, self).__init__()
+        self.nonlinear = get_nonlinear(config_str, in_channels)
+        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
+
+    def forward(self, x):
+        x = self.nonlinear(x)
+        x = self.linear(x)
+        return x
+
+
+class DenseLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bias=False,
+                 config_str='batchnorm-relu'):
+        super(DenseLayer, self).__init__()
+        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
+        self.nonlinear = get_nonlinear(config_str, out_channels)
+
+    def forward(self, x):
+        if len(x.shape) == 2:
+            x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
+        else:
+            x = self.linear(x)
+        x = self.nonlinear(x)
+        return x
+
+
+class BasicResBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicResBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes,
+                               planes,
+                               kernel_size=3,
+                               stride=(stride, 1),
+                               padding=1,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes,
+                               planes,
+                               kernel_size=3,
+                               stride=1,
+                               padding=1,
+                               bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes,
+                          self.expansion * planes,
+                          kernel_size=1,
+                          stride=(stride, 1),
+                          bias=False),
+                nn.BatchNorm2d(self.expansion * planes))
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out