refactor: rename canto-backend → backend, canto-frontend → frontend
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
backend/indextts/utils/common.py (new file, 121 lines)
@@ -0,0 +1,121 @@
import os
import random
import re

import torch
import torchaudio

MATPLOTLIB_FLAG = False


def load_audio(audiopath, sampling_rate):
    audio, sr = torchaudio.load(audiopath)
    # print(f"wave shape: {audio.shape}, sample_rate: {sr}")

    if audio.size(0) > 1:  # reduce to mono by keeping the first channel
        audio = audio[0].unsqueeze(0)

    if sr != sampling_rate:
        try:
            audio = torchaudio.functional.resample(audio, sr, sampling_rate)
        except Exception as e:
            print(f"Warning: resample failed ({e}): {audiopath}, wave shape: {audio.shape}, sample_rate: {sr}")
            return None
    # clip invalid values to the valid waveform range
    audio.clip_(-1, 1)
    return audio
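
if __name__ == "__main__":
    # Illustrative usage sketch, not part of the committed file: load a
    # reference clip and resample it to a target rate. The path and the
    # 24 kHz rate below are assumed example values; the demo is skipped
    # if the file is absent.
    _demo_path = "examples/voice_01.wav"  # hypothetical path
    if os.path.exists(_demo_path):
        wav = load_audio(_demo_path, sampling_rate=24000)
        # load_audio returns None when resampling fails
        print(None if wav is None else wav.shape)  # e.g. torch.Size([1, N])
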
def tokenize_by_CJK_char(line: str, do_upper_case=True) -> str:
    """
    Tokenize a line of text with CJK chars.

    Note: all returned characters are upper-cased when ``do_upper_case`` is True.

    Example:
      input = "你好世界是 hello world 的中文"
      output = "你 好 世 界 是 HELLO WORLD 的 中 文"

    Args:
      line:
        The input text.

    Return:
      A new string tokenized by CJK char.
    """
    # The CJK ranges are from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
    CJK_RANGE_PATTERN = (
        r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
    )
    chars = re.split(CJK_RANGE_PATTERN, line.strip())
    return " ".join([w.strip().upper() if do_upper_case else w.strip() for w in chars if w.strip()])
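
if __name__ == "__main__":
    # Illustrative usage sketch, not part of the committed file: the docstring
    # example run through tokenize_by_CJK_char. Each CJK character becomes its
    # own space-separated token while the Latin text stays one upper-cased chunk.
    print(tokenize_by_CJK_char("你好世界是 hello world 的中文"))
    # -> 你 好 世 界 是 HELLO WORLD 的 中 文
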
def de_tokenized_by_CJK_char(line: str, do_lower_case=False) -> str:
    """
    Example:
      input = "你 好 世 界 是 HELLO WORLD 的 中 文"
      output = "你好世界是 hello world 的中文"

    do_lower_case:
      input = "SEE YOU!"
      output = "see you!"
    """
    # replace English words in the line with placeholders
    english_word_pattern = re.compile(r"([A-Z]+(?:[\s-][A-Z-]+)*)", re.IGNORECASE)
    english_sents = english_word_pattern.findall(line)
    for i, sent in enumerate(english_sents):
        line = line.replace(sent, f"<sent_{i}>")

    words = line.split()
    # restore English sentences from their placeholders
    sent_placeholder_pattern = re.compile(r"^.*?(<sent_(\d+)>)")
    for i in range(len(words)):
        m = sent_placeholder_pattern.match(words[i])
        if m:
            # restore the English word
            placeholder_index = int(m.group(2))
            words[i] = words[i].replace(m.group(1), english_sents[placeholder_index])
            if do_lower_case:
                words[i] = words[i].lower()
    return "".join(words)
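
if __name__ == "__main__":
    # Illustrative usage sketch, not part of the committed file: invert the
    # tokenized form. Note that "".join() re-joins all tokens without spaces,
    # so the restored English text sits flush against the CJK characters.
    spaced = "你 好 世 界 是 HELLO WORLD 的 中 文"
    print(de_tokenized_by_CJK_char(spaced, do_lower_case=True))
    # -> 你好世界是hello world的中文
    print(de_tokenized_by_CJK_char("SEE YOU!", do_lower_case=True))
    # -> see you!
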
def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Make mask tensor containing indices of padded part.

    See description of make_non_pad_mask.

    Args:
        lengths (torch.Tensor): Batch of lengths (B,).
    Returns:
        torch.Tensor: Mask tensor containing indices of padded part.

    Examples:
        >>> lengths = torch.tensor([5, 3, 2])
        >>> make_pad_mask(lengths)
        masks = [[0, 0, 0, 0, 0],
                 [0, 0, 0, 1, 1],
                 [0, 0, 1, 1, 1]]
    """
    batch_size = lengths.size(0)
    max_len = max_len if max_len > 0 else lengths.max().item()
    seq_range = torch.arange(0, max_len, dtype=torch.int64, device=lengths.device)
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_length_expand = lengths.unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand
    return mask
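
if __name__ == "__main__":
    # Illustrative usage sketch, not part of the committed file: the docstring
    # example as a runnable call. True marks padded positions.
    print(make_pad_mask(torch.tensor([5, 3, 2])))
    # tensor([[False, False, False, False, False],
    #         [False, False, False,  True,  True],
    #         [False, False,  True,  True,  True]])
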
def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """
    Computes the element-wise logarithm of the input tensor, with the input
    clipped from below so that near-zero values do not produce -inf.

    Args:
        x (Tensor): Input tensor.
        clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.

    Returns:
        Tensor: Element-wise logarithm of the input tensor with clipping applied.
    """
    return torch.log(torch.clip(x, min=clip_val))
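
if __name__ == "__main__":
    # Illustrative usage sketch, not part of the committed file: values at or
    # below clip_val are clipped to 1e-7 before the log, so log(0) yields
    # about -16.12 instead of -inf.
    print(safe_log(torch.tensor([0.0, 1e-3, 1.0])))
    # -> tensor([-16.1181,  -6.9078,   0.0000])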