Merge branch '4.1-Stable' into 4.1-Stable
This commit is contained in:
commit
f090e33e9c
|
@ -0,0 +1,4 @@
|
|||
select = ["E", "F", "I"]
|
||||
|
||||
# Never enforce `E501` (line length violations).
|
||||
ignore = ["E501"]
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"recommendations": [
|
||||
"charliermarsh.ruff",
|
||||
"ms-python.python"
|
||||
]
|
||||
}
|
|
@ -396,7 +396,7 @@ python train_index.py -c configs/config.json
|
|||
模型的输出会在`logs/44k/feature_and_index.pkl`
|
||||
|
||||
+ 推理过程:
|
||||
+ 需要首先制定`--feature_retrieval`,此时聚类方案会自动切换到特征检索方案
|
||||
+ 需要首先指定`--feature_retrieval`,此时聚类方案会自动切换到特征检索方案
|
||||
+ `inference_main.py`中指定`cluster_model_path` 为模型输出文件, 留空则默认为`logs/44k/feature_and_index.pkl`
|
||||
+ `inference_main.py`中指定`cluster_infer_ratio`,`0`为完全不使用特征检索,`1`为只使用特征检索,通常设置`0.5`即可
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
|
||||
def get_cluster_model(ckpt_path):
|
||||
checkpoint = torch.load(ckpt_path)
|
||||
kmeans_dict = {}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
import math,pdb
|
||||
import torch,pynvml
|
||||
from torch.nn.functional import normalize
|
||||
from time import time
|
||||
|
||||
import numpy as np
|
||||
import pynvml
|
||||
import torch
|
||||
from torch.nn.functional import normalize
|
||||
|
||||
|
||||
# device=torch.device("cuda:0")
|
||||
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
|
||||
""" Picks k points in the data based on the kmeans++ method.
|
||||
|
|
|
@ -1,19 +1,17 @@
|
|||
import time,pdb
|
||||
import tqdm
|
||||
from time import time as ttime
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import argparse
|
||||
from kmeans import KMeansGPU
|
||||
import torch
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import KMeans,MiniBatchKMeans
|
||||
import torch
|
||||
import tqdm
|
||||
from kmeans import KMeansGPU
|
||||
from sklearn.cluster import KMeans, MiniBatchKMeans
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
from time import time as ttime
|
||||
import pynvml,torch
|
||||
|
||||
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):#gpu_minibatch真拉,虽然库支持但是也不考虑
|
||||
logger.info(f"Loading features from {in_dir}")
|
||||
|
@ -29,7 +27,7 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
|
|||
features = features.astype(np.float32)
|
||||
logger.info(f"Clustering features of shape: {features.shape}")
|
||||
t = time.time()
|
||||
if(use_gpu==False):
|
||||
if(use_gpu is False):
|
||||
if use_minibatch:
|
||||
kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
|
||||
else:
|
||||
|
@ -37,14 +35,14 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
|
|||
else:
|
||||
kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
|
||||
features=torch.from_numpy(features)#.to(device)
|
||||
labels = kmeans.fit_predict(features)#
|
||||
kmeans.fit_predict(features)#
|
||||
|
||||
print(time.time()-t, "s")
|
||||
|
||||
x = {
|
||||
"n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
|
||||
"_n_threads": kmeans._n_threads if use_gpu==False else 4,
|
||||
"cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
|
||||
"n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
|
||||
"_n_threads": kmeans._n_threads if use_gpu is False else 4,
|
||||
"cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
|
||||
}
|
||||
print("end")
|
||||
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
import time
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
|
||||
import modules.commons as commons
|
||||
import utils
|
||||
from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
from modules.mel_processing import spectrogram_torch
|
||||
from utils import load_filepaths_and_text, load_wav_to_torch
|
||||
|
||||
# import h5py
|
||||
|
||||
|
@ -87,7 +86,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
|
||||
spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
|
||||
audio_norm = audio_norm[:, :lmin * self.hop_length]
|
||||
if volume!= None:
|
||||
if volume is not None:
|
||||
volume = volume[:lmin]
|
||||
return c, f0, spec, audio_norm, spk, uv, volume
|
||||
|
||||
|
@ -96,7 +95,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
# print("skip too short audio:", filename)
|
||||
# return None
|
||||
|
||||
if random.choice([True, False]) and self.vol_aug and volume!=None:
|
||||
if random.choice([True, False]) and self.vol_aug and volume is not None:
|
||||
max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
|
||||
max_shift = min(1, np.log10(1/max_amp))
|
||||
log10_vol_shift = random.uniform(-1, max_shift)
|
||||
|
@ -114,7 +113,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
end = start + 790
|
||||
spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
|
||||
audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
|
||||
if volume !=None:
|
||||
if volume is not None:
|
||||
volume = volume[start:end]
|
||||
return c, f0, spec, audio_norm, spk, uv,volume
|
||||
|
||||
|
@ -178,7 +177,7 @@ class TextAudioCollate:
|
|||
uv = row[5]
|
||||
uv_padded[i, :uv.size(0)] = uv
|
||||
volume = row[6]
|
||||
if volume != None:
|
||||
if volume is not None:
|
||||
volume_padded[i, :volume.size(0)] = volume
|
||||
else :
|
||||
volume_padded = None
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
import os
|
||||
import random
|
||||
import re
|
||||
import numpy as np
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import torch
|
||||
import random
|
||||
from utils import repeat_expand_2d
|
||||
from tqdm import tqdm
|
||||
from torch.utils.data import Dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from utils import repeat_expand_2d
|
||||
|
||||
|
||||
def traverse_dir(
|
||||
root_dir,
|
||||
|
@ -130,7 +131,7 @@ class AudioDataset(Dataset):
|
|||
with open(filelists,"r") as f:
|
||||
self.paths = f.read().splitlines()
|
||||
for name_ext in tqdm(self.paths, total=len(self.paths)):
|
||||
name = os.path.splitext(name_ext)[0]
|
||||
os.path.splitext(name_ext)[0]
|
||||
path_audio = name_ext
|
||||
duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from collections import deque
|
||||
from functools import partial
|
||||
from inspect import isfunction
|
||||
import torch.nn.functional as F
|
||||
import librosa.sequence
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
|
||||
|
@ -26,8 +26,10 @@ def extract(a, t, x_shape):
|
|||
|
||||
|
||||
def noise_like(shape, device, repeat=False):
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
noise = lambda: torch.randn(shape, device=device)
|
||||
def repeat_noise():
|
||||
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
def noise():
|
||||
return torch.randn(shape, device=device)
|
||||
return repeat_noise() if repeat else noise()
|
||||
|
||||
|
||||
|
@ -253,7 +255,11 @@ class GaussianDiffusion(nn.Module):
|
|||
|
||||
if method is not None and infer_speedup > 1:
|
||||
if method == 'dpm-solver' or method == 'dpm-solver++':
|
||||
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||
from .dpm_solver_pytorch import (
|
||||
DPM_Solver,
|
||||
NoiseScheduleVP,
|
||||
model_wrapper,
|
||||
)
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
|
@ -331,7 +337,7 @@ class GaussianDiffusion(nn.Module):
|
|||
infer_speedup, cond=cond
|
||||
)
|
||||
elif method == 'unipc':
|
||||
from .uni_pc import NoiseScheduleVP, model_wrapper, UniPC
|
||||
from .uni_pc import NoiseScheduleVP, UniPC, model_wrapper
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
import math
|
||||
from collections import deque
|
||||
from functools import partial
|
||||
from inspect import isfunction
|
||||
import torch.nn.functional as F
|
||||
import librosa.sequence
|
||||
|
||||
import numpy as np
|
||||
from torch.nn import Conv1d
|
||||
from torch.nn import Mish
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d, Mish
|
||||
from tqdm import tqdm
|
||||
import math
|
||||
|
||||
|
||||
def exists(x):
|
||||
|
@ -27,8 +26,10 @@ def extract(a, t):
|
|||
|
||||
|
||||
def noise_like(shape, device, repeat=False):
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
noise = lambda: torch.randn(shape, device=device)
|
||||
def repeat_noise():
|
||||
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
def noise():
|
||||
return torch.randn(shape, device=device)
|
||||
return repeat_noise() if repeat else noise()
|
||||
|
||||
|
||||
|
@ -389,7 +390,11 @@ class GaussianDiffusion(nn.Module):
|
|||
|
||||
if method is not None and infer_speedup > 1:
|
||||
if method == 'dpm-solver':
|
||||
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||
from .dpm_solver_pytorch import (
|
||||
DPM_Solver,
|
||||
NoiseScheduleVP,
|
||||
model_wrapper,
|
||||
)
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
|
@ -577,7 +582,7 @@ class GaussianDiffusion(nn.Module):
|
|||
noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)
|
||||
|
||||
ot = step_range[0]
|
||||
ot_1 = torch.full((1,), ot, device=device, dtype=torch.long)
|
||||
torch.full((1,), ot, device=device, dtype=torch.long)
|
||||
|
||||
for t in step_range:
|
||||
t_1 = torch.full((1,), t, device=device, dtype=torch.long)
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
import math
|
||||
|
||||
|
||||
class NoiseScheduleVP:
|
||||
|
@ -559,7 +557,7 @@ class DPM_Solver:
|
|||
x_t: A pytorch tensor. The approximated solution at time `t`.
|
||||
"""
|
||||
ns = self.noise_schedule
|
||||
dims = x.dim()
|
||||
x.dim()
|
||||
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
|
||||
h = lambda_t - lambda_s
|
||||
log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
|
||||
|
@ -984,12 +982,16 @@ class DPM_Solver:
|
|||
nfe = 0
|
||||
if order == 2:
|
||||
r1 = 0.5
|
||||
lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
|
||||
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
|
||||
def lower_update(x, s, t):
|
||||
return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
|
||||
def higher_update(x, s, t, **kwargs):
|
||||
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
|
||||
elif order == 3:
|
||||
r1, r2 = 1. / 3., 2. / 3.
|
||||
lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
|
||||
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
|
||||
def lower_update(x, s, t):
|
||||
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
|
||||
def higher_update(x, s, t, **kwargs):
|
||||
return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
|
||||
else:
|
||||
raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
|
||||
while torch.abs((s - t_0)).mean() > t_err:
|
||||
|
@ -997,7 +999,8 @@ class DPM_Solver:
|
|||
x_lower, lower_noise_kwargs = lower_update(x, s, t)
|
||||
x_higher = higher_update(x, s, t, **lower_noise_kwargs)
|
||||
delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
|
||||
norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
|
||||
def norm_fn(v):
|
||||
return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
|
||||
E = norm_fn((x_higher - x_lower) / delta).max()
|
||||
if torch.all(E <= 1.):
|
||||
x = x_higher
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from diffusion.unit2mel import load_model_vocoder
|
||||
|
||||
|
||||
|
|
|
@ -2,16 +2,16 @@
|
|||
author: wayn391@mastertones
|
||||
'''
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import yaml
|
||||
import datetime
|
||||
import torch
|
||||
import os
|
||||
import time
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from . import utils
|
||||
import torch
|
||||
import yaml
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
|
||||
class Saver(object):
|
||||
def __init__(
|
||||
self,
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
import os
|
||||
import yaml
|
||||
import json
|
||||
import pickle
|
||||
import os
|
||||
|
||||
import torch
|
||||
import yaml
|
||||
|
||||
|
||||
def traverse_dir(
|
||||
root_dir,
|
||||
|
@ -121,6 +122,6 @@ def load_model(
|
|||
ckpt = torch.load(path_pt, map_location=torch.device(device))
|
||||
global_step = ckpt['global_step']
|
||||
model.load_state_dict(ckpt['model'], strict=False)
|
||||
if ckpt.get('optimizer') != None:
|
||||
if ckpt.get("optimizer") is not None:
|
||||
optimizer.load_state_dict(ckpt['optimizer'])
|
||||
return global_step, model, optimizer
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
from diffusion_onnx import GaussianDiffusion
|
||||
import os
|
||||
import yaml
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
from wavenet import WaveNet
|
||||
import torch.nn.functional as F
|
||||
import diffusion
|
||||
import yaml
|
||||
from diffusion_onnx import GaussianDiffusion
|
||||
|
||||
|
||||
class DotDict(dict):
|
||||
def __getattr__(*args):
|
||||
|
@ -147,8 +147,8 @@ class Unit2Mel(nn.Module):
|
|||
spks.update({i:1.0/float(self.n_spk)})
|
||||
spk_mix = torch.tensor(spk_mix)
|
||||
spk_mix = spk_mix.repeat(n_frames, 1)
|
||||
orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
|
||||
self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
self.forward(hubert, mel2ph, f0, volume, spk_mix)
|
||||
if export_encoder:
|
||||
torch.onnx.export(
|
||||
self,
|
||||
|
@ -182,8 +182,8 @@ class Unit2Mel(nn.Module):
|
|||
spk_mix.append(1.0/float(self.n_spk))
|
||||
spks.update({i:1.0/float(self.n_spk)})
|
||||
spk_mix = torch.tensor(spk_mix)
|
||||
orgouttt = self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
|
||||
self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
self.forward(hubert, mel2ph, f0, volume, spk_mix)
|
||||
|
||||
torch.onnx.export(
|
||||
self,
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
import os
|
||||
import time
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import torch
|
||||
import librosa
|
||||
from diffusion.logger.saver import Saver
|
||||
from diffusion.logger import utils
|
||||
from torch import autocast
|
||||
from torch.cuda.amp import GradScaler
|
||||
|
||||
from diffusion.logger import utils
|
||||
from diffusion.logger.saver import Saver
|
||||
|
||||
|
||||
def test(args, model, vocoder, loader_test, saver):
|
||||
print(' [*] testing...')
|
||||
model.eval()
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class NoiseScheduleVP:
|
||||
def __init__(
|
||||
|
@ -109,7 +109,8 @@ class NoiseScheduleVP:
|
|||
elif self.schedule == 'linear':
|
||||
return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
|
||||
elif self.schedule == 'cosine':
|
||||
log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
|
||||
def log_alpha_fn(s):
|
||||
return torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
|
||||
log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
|
||||
return log_alpha_t
|
||||
|
||||
|
@ -147,7 +148,8 @@ class NoiseScheduleVP:
|
|||
return t.reshape((-1,))
|
||||
else:
|
||||
log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
|
||||
t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
|
||||
def t_fn(log_alpha_t):
|
||||
return torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2.0 * (1.0 + self.cosine_s) / math.pi - self.cosine_s
|
||||
t = t_fn(log_alpha)
|
||||
return t
|
||||
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
import os
|
||||
import yaml
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
from .diffusion import GaussianDiffusion
|
||||
from .wavenet import WaveNet
|
||||
from .vocoder import Vocoder
|
||||
from .wavenet import WaveNet
|
||||
|
||||
|
||||
class DotDict(dict):
|
||||
def __getattr__(*args):
|
||||
|
@ -21,9 +24,11 @@ def load_model_vocoder(
|
|||
device='cpu',
|
||||
config_path = None
|
||||
):
|
||||
if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
|
||||
else: config_file = config_path
|
||||
|
||||
if config_path is None:
|
||||
config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
|
||||
else:
|
||||
config_file = config_path
|
||||
|
||||
with open(config_file, "r") as config:
|
||||
args = yaml.safe_load(config)
|
||||
args = DotDict(args)
|
||||
|
@ -116,13 +121,13 @@ class Unit2Mel(nn.Module):
|
|||
hubert_hidden_size = self.input_channel
|
||||
n_frames = 10
|
||||
hubert = torch.randn((1, n_frames, hubert_hidden_size))
|
||||
mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
|
||||
torch.arange(end=n_frames).unsqueeze(0).long()
|
||||
f0 = torch.randn((1, n_frames))
|
||||
volume = torch.randn((1, n_frames))
|
||||
spks = {}
|
||||
for i in range(n_spk):
|
||||
spks.update({i:1.0/float(self.n_spk)})
|
||||
orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
|
||||
def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
|
||||
gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import torch
|
||||
from vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
from vdecoder.nsf_hifigan.models import load_model,load_config
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
|
||||
from vdecoder.nsf_hifigan.models import load_config, load_model
|
||||
from vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
|
||||
|
||||
class Vocoder:
|
||||
def __init__(self, vocoder_type, vocoder_ckpt, device = None):
|
||||
if device is None:
|
||||
|
|
|
@ -7,7 +7,7 @@ import torchaudio
|
|||
from flask import Flask, request, send_file
|
||||
from flask_cors import CORS
|
||||
|
||||
from inference.infer_tool import Svc, RealTimeVC
|
||||
from inference.infer_tool import RealTimeVC, Svc
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import io
|
||||
|
||||
import numpy as np
|
||||
import soundfile
|
||||
from flask import Flask, request, send_file
|
||||
|
||||
from inference import infer_tool
|
||||
from inference import slicer
|
||||
from inference import infer_tool, slicer
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
import gc
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
from pathlib import Path
|
||||
from inference import slicer
|
||||
import gc
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
|
||||
# import onnxruntime
|
||||
import soundfile
|
||||
import torch
|
||||
|
@ -17,11 +18,9 @@ import torchaudio
|
|||
|
||||
import cluster
|
||||
import utils
|
||||
from models import SynthesizerTrn
|
||||
import pickle
|
||||
|
||||
from diffusion.unit2mel import load_model_vocoder
|
||||
import yaml
|
||||
from inference import slicer
|
||||
from models import SynthesizerTrn
|
||||
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
|
||||
|
@ -153,7 +152,7 @@ class Svc(object):
|
|||
self.hop_size = self.diffusion_args.data.block_size
|
||||
self.spk2id = self.diffusion_args.spk
|
||||
self.speech_encoder = self.diffusion_args.data.encoder
|
||||
self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode!=None else 'left'
|
||||
self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
|
||||
if spk_mix_enable:
|
||||
self.diffusion_model.init_spkmix(len(self.spk2id))
|
||||
else:
|
||||
|
@ -180,7 +179,8 @@ class Svc(object):
|
|||
else:
|
||||
self.feature_retrieval=False
|
||||
|
||||
if self.shallow_diffusion : self.nsf_hifigan_enhance = False
|
||||
if self.shallow_diffusion :
|
||||
self.nsf_hifigan_enhance = False
|
||||
if self.nsf_hifigan_enhance:
|
||||
from modules.enhancer import Enhancer
|
||||
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
|
||||
|
@ -290,7 +290,7 @@ class Svc(object):
|
|||
audio = torch.FloatTensor(wav).to(self.dev)
|
||||
audio_mel = None
|
||||
if self.only_diffusion or self.shallow_diffusion:
|
||||
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
|
||||
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
|
||||
if self.shallow_diffusion and second_encoding:
|
||||
audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
|
||||
audio16k = torch.from_numpy(audio16k).to(self.dev)
|
||||
|
@ -443,7 +443,8 @@ class Svc(object):
|
|||
datas = [data]
|
||||
for k,dat in enumerate(datas):
|
||||
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
|
||||
if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
|
||||
if clip_seconds!=0:
|
||||
print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
|
||||
# padd
|
||||
pad_len = int(audio_sr * pad_seconds)
|
||||
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
|
||||
|
|
|
@ -1,22 +1,18 @@
|
|||
import hashlib
|
||||
import json
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
import io
|
||||
|
||||
import librosa
|
||||
import maad
|
||||
import numpy as np
|
||||
from inference import slicer
|
||||
import parselmouth
|
||||
import soundfile
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
from hubert import hubert_model
|
||||
import utils
|
||||
from inference import slicer
|
||||
from models import SynthesizerTrn
|
||||
|
||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
|
||||
|
@ -93,7 +89,7 @@ class VitsSvc(object):
|
|||
def set_device(self, device):
|
||||
self.device = torch.device(device)
|
||||
self.hubert_soft.to(self.device)
|
||||
if self.SVCVITS != None:
|
||||
if self.SVCVITS is not None:
|
||||
self.SVCVITS.to(self.device)
|
||||
|
||||
def loadCheckpoint(self, path):
|
||||
|
|
|
@ -1,15 +1,10 @@
|
|||
import io
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from spkmix import spk_mix_map
|
||||
import librosa
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
import soundfile
|
||||
|
||||
from inference import infer_tool
|
||||
from inference import slicer
|
||||
from inference.infer_tool import Svc
|
||||
from spkmix import spk_mix_map
|
||||
|
||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
|
||||
|
@ -146,8 +141,10 @@ def main():
|
|||
key = "auto" if auto_predict_f0 else f"{tran}key"
|
||||
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
|
||||
isdiffusion = "sovits"
|
||||
if shallow_diffusion : isdiffusion = "sovdiff"
|
||||
if only_diffusion : isdiffusion = "diff"
|
||||
if shallow_diffusion :
|
||||
isdiffusion = "sovdiff"
|
||||
if only_diffusion :
|
||||
isdiffusion = "diff"
|
||||
if use_spk_mix:
|
||||
spk = "spk_mix"
|
||||
res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}'
|
||||
|
|
19
models.py
19
models.py
|
@ -1,20 +1,17 @@
|
|||
import copy
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d, Conv2d
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.utils import spectral_norm, weight_norm
|
||||
|
||||
import modules.attentions as attentions
|
||||
import modules.commons as commons
|
||||
import modules.modules as modules
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
|
||||
import utils
|
||||
from modules.commons import init_weights, get_padding
|
||||
from modules.commons import get_padding
|
||||
from utils import f0_to_coarse
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
|
||||
def __init__(self,
|
||||
channels,
|
||||
|
@ -125,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
self.use_spectral_norm = use_spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
||||
|
@ -160,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
||||
|
@ -407,7 +404,7 @@ class SynthesizerTrn(nn.Module):
|
|||
g = self.emb_g(g).transpose(1,2)
|
||||
|
||||
# vol proj
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
|
||||
|
||||
# ssl prenet
|
||||
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
|
||||
|
@ -452,7 +449,7 @@ class SynthesizerTrn(nn.Module):
|
|||
|
||||
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
|
||||
# vol proj
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
|
||||
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
from modules.F0Predictor.crepe import CrepePitchExtractor
|
||||
import torch
|
||||
|
||||
from modules.F0Predictor.crepe import CrepePitchExtractor
|
||||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
|
||||
|
||||
class CrepeF0Predictor(F0Predictor):
|
||||
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"):
|
||||
self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
import pyworld
|
||||
import numpy as np
|
||||
import pyworld
|
||||
|
||||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
|
||||
|
||||
class DioF0Predictor(F0Predictor):
|
||||
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
import pyworld
|
||||
import numpy as np
|
||||
import pyworld
|
||||
|
||||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
|
||||
|
||||
class HarvestF0Predictor(F0Predictor):
|
||||
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
import parselmouth
|
||||
import numpy as np
|
||||
import parselmouth
|
||||
|
||||
from modules.F0Predictor.F0Predictor import F0Predictor
|
||||
|
||||
|
||||
class PMF0Predictor(F0Predictor):
|
||||
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
from typing import Optional,Union
|
||||
from typing import Optional, Union
|
||||
|
||||
try:
|
||||
from typing import Literal
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
from typing_extensions import Literal
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchcrepe
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
import scipy
|
||||
|
||||
#from:https://github.com/fishaudio/fish-diffusion
|
||||
|
||||
|
@ -334,7 +334,7 @@ class CrepePitchExtractor(BasePitchExtractor):
|
|||
f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
|
||||
|
||||
if torch.all(f0 == 0):
|
||||
rtn = f0.cpu().numpy() if pad_to==None else np.zeros(pad_to)
|
||||
rtn = f0.cpu().numpy() if pad_to is None else np.zeros(pad_to)
|
||||
return rtn,rtn
|
||||
|
||||
return self.post_process(x, sampling_rate, f0, pad_to)
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import modules.commons as commons
|
||||
import modules.modules as modules
|
||||
from modules.modules import LayerNorm
|
||||
|
||||
|
||||
|
@ -243,7 +241,7 @@ class MultiHeadAttention(nn.Module):
|
|||
return ret
|
||||
|
||||
def _get_relative_embeddings(self, relative_embeddings, length):
|
||||
max_relative_position = 2 * self.window_size + 1
|
||||
2 * self.window_size + 1
|
||||
# Pad first before slice to avoid using cond ops.
|
||||
pad_length = max(length - (self.window_size + 1), 0)
|
||||
slice_start_position = max((self.window_size + 1) - length, 0)
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
import math
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def slice_pitch_segments(x, ids_str, segment_size=4):
|
||||
ret = torch.zeros_like(x[:, :segment_size])
|
||||
for i in range(x.size(0)):
|
||||
|
@ -134,12 +134,6 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
|||
return acts
|
||||
|
||||
|
||||
def convert_pad_shape(pad_shape):
|
||||
l = pad_shape[::-1]
|
||||
pad_shape = [item for sublist in l for item in sublist]
|
||||
return pad_shape
|
||||
|
||||
|
||||
def shift_1d(x):
|
||||
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
|
||||
return x
|
||||
|
@ -157,7 +151,6 @@ def generate_path(duration, mask):
|
|||
duration: [b, 1, t_x]
|
||||
mask: [b, 1, t_y, t_x]
|
||||
"""
|
||||
device = duration.device
|
||||
|
||||
b, _, t_y, t_x = mask.shape
|
||||
cum_duration = torch.cumsum(duration, -1)
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
from vdecoder.nsf_hifigan.models import load_model
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
from vdecoder.nsf_hifigan.models import load_model
|
||||
from vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
|
||||
|
||||
class Enhancer:
|
||||
def __init__(self, enhancer_type, enhancer_ckpt, device=None):
|
||||
if device is None:
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
import modules.commons as commons
|
||||
import torch
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
|
||||
|
|
|
@ -1,16 +1,5 @@
|
|||
import math
|
||||
import os
|
||||
import random
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
import librosa.util as librosa_util
|
||||
from librosa.util import normalize, pad_center, tiny
|
||||
from scipy.signal import get_window
|
||||
from scipy.io.wavfile import read
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
MAX_WAV_VALUE = 32768.0
|
||||
|
|
|
@ -1,17 +1,11 @@
|
|||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
import scipy
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d
|
||||
from torch.nn import functional as F
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||
from torch.nn.utils import remove_weight_norm, weight_norm
|
||||
|
||||
import modules.commons as commons
|
||||
from modules.commons import init_weights, get_padding
|
||||
|
||||
from modules.commons import get_padding, init_weights
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
import torch
|
||||
from onnxexport.model_onnx import SynthesizerTrn
|
||||
|
||||
import utils
|
||||
from onnxexport.model_onnx import SynthesizerTrn
|
||||
|
||||
|
||||
def main(NetExport):
|
||||
path = "SoVits4.0"
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
import torch
|
||||
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
||||
import utils
|
||||
import json
|
||||
|
||||
import torch
|
||||
|
||||
import utils
|
||||
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
||||
|
||||
|
||||
def main():
|
||||
path = "crs"
|
||||
|
||||
|
@ -127,7 +130,7 @@ def main():
|
|||
"Characters": spklist
|
||||
}
|
||||
|
||||
MoeVSConfJson = json.dumps(MoeVSConf)
|
||||
json.dumps(MoeVSConf)
|
||||
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
|
||||
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
|
||||
|
||||
|
|
|
@ -1,18 +1,16 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import Conv1d, Conv2d
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.utils import spectral_norm, weight_norm
|
||||
|
||||
import modules.attentions as attentions
|
||||
import modules.commons as commons
|
||||
import modules.modules as modules
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
|
||||
import utils
|
||||
from modules.commons import init_weights, get_padding
|
||||
from vdecoder.hifigan.models import Generator
|
||||
from modules.commons import get_padding
|
||||
from utils import f0_to_coarse
|
||||
from vdecoder.hifigan.models import Generator
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
|
||||
|
@ -124,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
self.use_spectral_norm = use_spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
||||
|
@ -159,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
||||
|
|
|
@ -1,20 +1,12 @@
|
|||
import copy
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import modules.attentions as attentions
|
||||
import modules.commons as commons
|
||||
import modules.modules as modules
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
|
||||
import utils
|
||||
from modules.commons import init_weights, get_padding
|
||||
from utils import f0_to_coarse
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
|
||||
def __init__(self,
|
||||
channels,
|
||||
|
@ -259,7 +251,7 @@ class SynthesizerTrn(nn.Module):
|
|||
|
||||
x_mask = torch.unsqueeze(torch.ones_like(f0), 1).to(c.dtype)
|
||||
# vol proj
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
|
||||
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
|
||||
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import os
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import wave
|
||||
from random import shuffle
|
||||
|
||||
from tqdm import tqdm
|
||||
from random import shuffle
|
||||
import json
|
||||
import wave
|
||||
|
||||
import diffusion.logger.utils as du
|
||||
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
import os
|
||||
import utils
|
||||
import torch
|
||||
import random
|
||||
import librosa
|
||||
import logging
|
||||
import argparse
|
||||
import logging
|
||||
import multiprocessing
|
||||
import numpy as np
|
||||
import diffusion.logger.utils as du
|
||||
|
||||
from glob import glob
|
||||
from tqdm import tqdm
|
||||
from random import shuffle
|
||||
from diffusion.vocoder import Vocoder
|
||||
import os
|
||||
import random
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from glob import glob
|
||||
from random import shuffle
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
import diffusion.logger.utils as du
|
||||
import utils
|
||||
from diffusion.vocoder import Vocoder
|
||||
from modules.mel_processing import spectrogram_torch
|
||||
|
||||
logging.getLogger("numba").setLevel(logging.WARNING)
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import os
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import os
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from scipy.io import wavfile
|
||||
from tqdm import tqdm
|
||||
|
||||
|
|
35
train.py
35
train.py
|
@ -1,39 +1,30 @@
|
|||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import time
|
||||
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import itertools
|
||||
import math
|
||||
import torch
|
||||
from torch import nn, optim
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
from torch.cuda.amp import GradScaler, autocast
|
||||
from torch.nn import functional as F
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
import torch.multiprocessing as mp
|
||||
import torch.distributed as dist
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.cuda.amp import autocast, GradScaler
|
||||
|
||||
import modules.commons as commons
|
||||
import utils
|
||||
from data_utils import TextAudioSpeakerLoader, TextAudioCollate
|
||||
from data_utils import TextAudioCollate, TextAudioSpeakerLoader
|
||||
from models import (
|
||||
SynthesizerTrn,
|
||||
MultiPeriodDiscriminator,
|
||||
SynthesizerTrn,
|
||||
)
|
||||
from modules.losses import (
|
||||
kl_loss,
|
||||
generator_loss, discriminator_loss, feature_loss
|
||||
)
|
||||
|
||||
from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
|
||||
from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
|
||||
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||
|
||||
torch.backends.cudnn.benchmark = True
|
||||
global_step = 0
|
||||
start_time = time.time()
|
||||
|
@ -287,7 +278,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|||
c = c[:1].cuda(0)
|
||||
f0 = f0[:1].cuda(0)
|
||||
uv= uv[:1].cuda(0)
|
||||
if volume!=None:
|
||||
if volume is not None:
|
||||
volume = volume[:1].cuda(0)
|
||||
mel = spec_to_mel_torch(
|
||||
spec,
|
||||
|
@ -314,7 +305,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|||
f"gt/audio_{batch_idx}": y[0]
|
||||
})
|
||||
image_dict.update({
|
||||
f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
|
||||
"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
|
||||
"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
|
||||
})
|
||||
utils.summarize(
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import os
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from torch.optim import lr_scheduler
|
||||
from diffusion.logger import utils
|
||||
|
||||
from diffusion.data_loaders import get_data_loaders
|
||||
from diffusion.logger import utils
|
||||
from diffusion.solver import train
|
||||
from diffusion.unit2mel import Unit2Mel
|
||||
from diffusion.vocoder import Vocoder
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import utils
|
||||
import pickle
|
||||
import os
|
||||
import argparse
|
||||
import os
|
||||
import pickle
|
||||
|
||||
import utils
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
|
37
utils.py
37
utils.py
|
@ -1,22 +1,18 @@
|
|||
import os
|
||||
import glob
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import warnings
|
||||
import random
|
||||
import functools
|
||||
import sys
|
||||
|
||||
import faiss
|
||||
import librosa
|
||||
import numpy as np
|
||||
from scipy.io.wavfile import read
|
||||
import torch
|
||||
from scipy.io.wavfile import read
|
||||
from torch.nn import functional as F
|
||||
from modules.commons import sequence_mask
|
||||
import faiss
|
||||
import tqdm
|
||||
|
||||
MATPLOTLIB_FLAG = False
|
||||
|
||||
|
@ -201,15 +197,20 @@ def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_tim
|
|||
False -> lexicographically delete ckpts
|
||||
"""
|
||||
ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
|
||||
name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1)))
|
||||
time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)))
|
||||
def name_key(_f):
|
||||
return int(re.compile("._(\\d+)\\.pth").match(_f).group(1))
|
||||
def time_key(_f):
|
||||
return os.path.getmtime(os.path.join(path_to_models, _f))
|
||||
sort_key = time_key if sort_by_time else name_key
|
||||
x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')], key=sort_key)
|
||||
def x_sorted(_x):
|
||||
return sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], key=sort_key)
|
||||
to_del = [os.path.join(path_to_models, fn) for fn in
|
||||
(x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])]
|
||||
del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}")
|
||||
del_routine = lambda x: [os.remove(x), del_info(x)]
|
||||
rs = [del_routine(fn) for fn in to_del]
|
||||
def del_info(fn):
|
||||
return logger.info(f".. Free up space by deleting ckpt {fn}")
|
||||
def del_routine(x):
|
||||
return [os.remove(x), del_info(x)]
|
||||
[del_routine(fn) for fn in to_del]
|
||||
|
||||
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
|
||||
for k, v in scalars.items():
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
import os
|
||||
import json
|
||||
from .env import AttrDict
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from .utils import init_weights, get_padding
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from .env import AttrDict
|
||||
from .utils import get_padding, init_weights
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
@ -199,7 +201,7 @@ class SineGen(torch.nn.Module):
|
|||
output uv: tensor(batchsize=1, length, 1)
|
||||
"""
|
||||
with torch.no_grad():
|
||||
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
|
||||
torch.zeros(f0.shape[0], f0.shape[1], self.dim,
|
||||
device=f0.device)
|
||||
# fundamental component
|
||||
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
||||
|
@ -353,7 +355,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
||||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
|
@ -412,7 +414,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
|
|||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
||||
|
|
|
@ -1,15 +1,13 @@
|
|||
import math
|
||||
import os
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import random
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
from librosa.util import normalize
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
from scipy.io.wavfile import read
|
||||
import soundfile as sf
|
||||
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
|
||||
sampling_rate = None
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import glob
|
||||
import os
|
||||
import matplotlib
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
# matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
||||
# LICENSE is in incl_licenses directory.
|
||||
|
||||
from .act import *
|
||||
from .filter import *
|
||||
from .resample import *
|
||||
from .act import *
|
|
@ -4,10 +4,10 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from torch import sin, pow
|
||||
from torch import pow, sin
|
||||
from torch.nn import Parameter
|
||||
from .resample import UpSample1d, DownSample1d
|
||||
|
||||
from .resample import DownSample1d, UpSample1d
|
||||
|
||||
|
||||
class Activation1d(nn.Module):
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
|
||||
# LICENSE is in incl_licenses directory.
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import math
|
||||
|
||||
if 'sinc' in dir(torch):
|
||||
sinc = torch.sinc
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
from .filter import LowPassFilter1d
|
||||
from .filter import kaiser_sinc_filter1d
|
||||
|
||||
from .filter import LowPassFilter1d, kaiser_sinc_filter1d
|
||||
|
||||
|
||||
class UpSample1d(nn.Module):
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
import os
|
||||
import json
|
||||
from .env import AttrDict
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from .utils import init_weights, get_padding
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from vdecoder.hifiganwithsnake.alias.act import SnakeAlias
|
||||
|
||||
from .env import AttrDict
|
||||
from .utils import get_padding, init_weights
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
|
@ -211,7 +214,7 @@ class SineGen(torch.nn.Module):
|
|||
output uv: tensor(batchsize=1, length, 1)
|
||||
"""
|
||||
with torch.no_grad():
|
||||
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
|
||||
torch.zeros(f0.shape[0], f0.shape[1], self.dim,
|
||||
device=f0.device)
|
||||
# fundamental component
|
||||
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
||||
|
@ -370,7 +373,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
||||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
|
@ -429,7 +432,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
|
|||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
||||
|
|
|
@ -1,15 +1,13 @@
|
|||
import math
|
||||
import os
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import random
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
from librosa.util import normalize
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
from scipy.io.wavfile import read
|
||||
import soundfile as sf
|
||||
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
|
||||
sampling_rate = None
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import glob
|
||||
import os
|
||||
import matplotlib
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
# matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
import os
|
||||
import json
|
||||
from .env import AttrDict
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from .utils import init_weights, get_padding
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
|
||||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
|
||||
|
||||
from .env import AttrDict
|
||||
from .utils import get_padding, init_weights
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
@ -289,7 +291,7 @@ class DiscriminatorP(torch.nn.Module):
|
|||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
||||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
|
@ -348,7 +350,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
|
|||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
||||
|
|
|
@ -1,16 +1,14 @@
|
|||
import math
|
||||
import os
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import random
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
|
||||
import librosa
|
||||
from librosa.util import normalize
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
from scipy.io.wavfile import read
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.data
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
|
||||
sampling_rate = None
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import glob
|
||||
import os
|
||||
|
||||
import matplotlib
|
||||
import matplotlib.pylab as plt
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class CNHubertLarge(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec256L12_Onnx(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/vec-256-layer-12.onnx", device=None):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec256L9(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec256L9_Onnx(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/vec-256-layer-9.onnx", device=None):
|
||||
super().__init__()
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec768L12(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec768L12_Onnx(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/vec-768-layer-12.onnx", device=None):
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class ContentVec768L9_Onnx(SpeechEncoder):
|
||||
def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
from vencoder.dphubert.model import wav2vec2_model
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class DPHubert(SpeechEncoder):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
from vencoder.hubert import hubert_model
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
|
||||
|
||||
class HubertSoft_Onnx(SpeechEncoder):
|
||||
def __init__(self, vec_path="pretrain/hubert-soft.onnx", device=None):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
from vencoder.wavlm.WavLM import WavLM, WavLMConfig
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
from vencoder.whisper.model import Whisper, ModelDimensions
|
||||
from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
|
||||
from vencoder.whisper.model import ModelDimensions, Whisper
|
||||
|
||||
|
||||
class WhisperPPG(SpeechEncoder):
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
from vencoder.whisper.model import Whisper, ModelDimensions
|
||||
from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
|
||||
from vencoder.whisper.model import ModelDimensions, Whisper
|
||||
|
||||
|
||||
class WhisperPPGLarge(SpeechEncoder):
|
||||
|
|
|
@ -5,19 +5,19 @@ https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/components
|
|||
|
||||
"""
|
||||
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from typing import List, Optional, Tuple
|
||||
import math
|
||||
|
||||
import torch
|
||||
from torch import nn, Tensor
|
||||
from torch.nn import Module, Parameter
|
||||
from torch import Tensor, nn
|
||||
from torch.nn import Module
|
||||
|
||||
from .hardconcrete import HardConcrete
|
||||
from .pruning_utils import (
|
||||
prune_linear_layer,
|
||||
prune_conv1d_layer,
|
||||
prune_layer_norm,
|
||||
prune_linear_layer,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from typing import Any, Dict
|
|||
|
||||
from torch.nn import Module
|
||||
|
||||
from ..model import wav2vec2_model, Wav2Vec2Model, wavlm_model
|
||||
from ..model import Wav2Vec2Model, wav2vec2_model, wavlm_model
|
||||
|
||||
_LG = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -7,26 +7,26 @@
|
|||
# https://github.com/pytorch/fairseq
|
||||
# --------------------------------------------------------
|
||||
|
||||
import math
|
||||
import logging
|
||||
import math
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import LayerNorm
|
||||
|
||||
from vencoder.wavlm.modules import (
|
||||
Fp32GroupNorm,
|
||||
Fp32LayerNorm,
|
||||
GLU_Linear,
|
||||
GradMultiply,
|
||||
MultiheadAttention,
|
||||
SamePad,
|
||||
init_bert_params,
|
||||
get_activation_fn,
|
||||
TransposeLast,
|
||||
GLU_Linear,
|
||||
get_activation_fn,
|
||||
init_bert_params,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -402,9 +402,7 @@ class ConvFeatureExtractionModel(nn.Module):
|
|||
nn.init.kaiming_normal_(conv.weight)
|
||||
return conv
|
||||
|
||||
assert (
|
||||
is_layer_norm and is_group_norm
|
||||
) == False, "layer norm and group norm are exclusive"
|
||||
assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive"
|
||||
|
||||
if is_layer_norm:
|
||||
return nn.Sequential(
|
||||
|
|
|
@ -10,10 +10,11 @@
|
|||
import math
|
||||
import warnings
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import Tensor, nn
|
||||
from torch.nn import Parameter
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class TransposeLast(nn.Module):
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
import os
|
||||
from functools import lru_cache
|
||||
from typing import Union
|
||||
|
||||
|
@ -6,11 +5,10 @@ import ffmpeg
|
|||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
from .utils import exact_div
|
||||
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
# hard-coded audio hyperparameters
|
||||
SAMPLE_RATE = 16000
|
||||
N_FFT = 400
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
@ -32,7 +32,7 @@ def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None)
|
|||
if tokenizer is None:
|
||||
tokenizer = get_tokenizer(model.is_multilingual)
|
||||
if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
|
||||
raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")
|
||||
raise ValueError("This model doesn't have language tokens so it can't perform lang id")
|
||||
|
||||
single = mel.ndim == 2
|
||||
if single:
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Dict
|
||||
from typing import Iterable, Optional
|
||||
from typing import Dict, Iterable, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import Tensor
|
||||
from torch import nn
|
||||
from torch import Tensor, nn
|
||||
|
||||
from .decoding import detect_language as detect_language_function, decode as decode_function
|
||||
from .decoding import decode as decode_function
|
||||
from .decoding import detect_language as detect_language_function
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -196,7 +196,7 @@ class Tokenizer:
|
|||
def language_token(self) -> int:
|
||||
"""Returns the token id corresponding to the value of the `language` field"""
|
||||
if self.language is None:
|
||||
raise ValueError(f"This tokenizer does not have language token configured")
|
||||
raise ValueError("This tokenizer does not have language token configured")
|
||||
|
||||
additional_tokens = dict(
|
||||
zip(
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
from google.colab import files
|
||||
import shutil
|
||||
import os
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from google.colab import files
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--type", type=str, required=True, help="type of file to upload")
|
||||
|
|
49
webUI.py
49
webUI.py
|
@ -1,5 +1,11 @@
|
|||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
from itertools import chain
|
||||
|
||||
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
|
||||
import gradio as gr
|
||||
|
@ -7,22 +13,12 @@ import gradio.processing_utils as gr_pu
|
|||
import librosa
|
||||
import numpy as np
|
||||
import soundfile
|
||||
from inference.infer_tool import Svc
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
|
||||
import subprocess
|
||||
import edge_tts
|
||||
import asyncio
|
||||
from scipy.io import wavfile
|
||||
import librosa
|
||||
import torch
|
||||
import time
|
||||
import traceback
|
||||
from itertools import chain
|
||||
from utils import mix_model
|
||||
from scipy.io import wavfile
|
||||
|
||||
from compress_model import removeOptimizer
|
||||
from inference.infer_tool import Svc
|
||||
from utils import mix_model
|
||||
|
||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
||||
|
@ -42,14 +38,15 @@ if torch.cuda.is_available():
|
|||
|
||||
def upload_mix_append_file(files,sfiles):
|
||||
try:
|
||||
if(sfiles == None):
|
||||
if(sfiles is None):
|
||||
file_paths = [file.name for file in files]
|
||||
else:
|
||||
file_paths = [file.name for file in chain(files,sfiles)]
|
||||
p = {file:100 for file in file_paths}
|
||||
return file_paths,mix_model_output1.update(value=json.dumps(p,indent=2))
|
||||
except Exception as e:
|
||||
if debug: traceback.print_exc()
|
||||
if debug:
|
||||
traceback.print_exc()
|
||||
raise gr.Error(e)
|
||||
|
||||
def mix_submit_click(js,mode):
|
||||
|
@ -63,16 +60,19 @@ def mix_submit_click(js,mode):
|
|||
path = mix_model(model_path,mix_rate,mode)
|
||||
return f"成功,文件被保存在了{path}"
|
||||
except Exception as e:
|
||||
if debug: traceback.print_exc()
|
||||
if debug:
|
||||
traceback.print_exc()
|
||||
raise gr.Error(e)
|
||||
|
||||
def updata_mix_info(files):
|
||||
try:
|
||||
if files == None : return mix_model_output1.update(value="")
|
||||
if files is None :
|
||||
return mix_model_output1.update(value="")
|
||||
p = {file.name:100 for file in files}
|
||||
return mix_model_output1.update(value=json.dumps(p,indent=2))
|
||||
except Exception as e:
|
||||
if debug: traceback.print_exc()
|
||||
if debug:
|
||||
traceback.print_exc()
|
||||
raise gr.Error(e)
|
||||
|
||||
def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix):
|
||||
|
@ -112,7 +112,8 @@ def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_
|
|||
msg += i + " "
|
||||
return sid.update(choices = spks,value=spks[0]), msg
|
||||
except Exception as e:
|
||||
if debug: traceback.print_exc()
|
||||
if debug:
|
||||
traceback.print_exc()
|
||||
raise gr.Error(e)
|
||||
|
||||
|
||||
|
@ -172,7 +173,8 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
|
|||
soundfile.write(output_file, _audio, model.target_sample, format="wav")
|
||||
return "Success", output_file
|
||||
except Exception as e:
|
||||
if debug: traceback.print_exc()
|
||||
if debug:
|
||||
traceback.print_exc()
|
||||
raise gr.Error(e)
|
||||
|
||||
def tts_func(_text,_rate,_voice):
|
||||
|
@ -180,7 +182,8 @@ def tts_func(_text,_rate,_voice):
|
|||
# voice = "zh-CN-XiaoyiNeural"#女性,较高音
|
||||
# voice = "zh-CN-YunxiNeural"#男性
|
||||
voice = "zh-CN-YunxiNeural"#男性
|
||||
if ( _voice == "女" ) : voice = "zh-CN-XiaoyiNeural"
|
||||
if ( _voice == "女" ) :
|
||||
voice = "zh-CN-XiaoyiNeural"
|
||||
output_file = _text[0:10]+".wav"
|
||||
# communicate = edge_tts.Communicate(_text, voice)
|
||||
# await communicate.save(output_file)
|
||||
|
|
Loading…
Reference in New Issue