Merge branch '4.1-Stable' into 4.1-Stable

2DIPW 2023-06-27 02:03:50 +08:00 committed by GitHub
commit f090e33e9c
83 changed files with 428 additions and 413 deletions

.ruff.toml Normal file (+4)
View File

@ -0,0 +1,4 @@
select = ["E", "F", "I"]
# Never enforce `E501` (line length violations).
ignore = ["E501"]
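
For context: `E`, `F`, and `I` select the pycodestyle, Pyflakes, and isort rule families, which is why this commit consists mostly of import re-sorting and lambda-to-def rewrites. A minimal way to run it, assuming ruff is installed; ruff discovers `.ruff.toml` at the project root on its own:

pip install ruff
ruff check .        # lint against the E/F/I rules selected above
ruff check . --fix  # apply the auto-fixes, e.g. import sorting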

.vscode/extensions.json vendored Normal file (+6)
View File

@ -0,0 +1,6 @@
{
"recommendations": [
"charliermarsh.ruff",
"ms-python.python"
]
}

View File

@ -396,7 +396,7 @@ python train_index.py -c configs/config.json
The model output will be at `logs/44k/feature_and_index.pkl`
+ Inference process:
+ `--feature_retrieval` must be specified first; the clustering scheme then switches automatically to the feature-retrieval scheme
+ Specify `cluster_model_path` in `inference_main.py` as the model output file; if left empty, it defaults to `logs/44k/feature_and_index.pkl`
+ Specify `cluster_infer_ratio` in `inference_main.py`: `0` means feature retrieval is not used at all, `1` means only feature retrieval is used; `0.5` is usually sufficient
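
A sketch of the full command this describes. The `-m`, `-c`, and `-n` flags (model path, config, input clip) and the `G_xxx.pth` placeholder are assumptions about the project's usual CLI; only the three retrieval options are taken from the text above:

python inference_main.py -m logs/44k/G_xxx.pth -c configs/config.json -n test.wav \
    --feature_retrieval \
    --cluster_model_path logs/44k/feature_and_index.pkl \
    --cluster_infer_ratio 0.5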

View File

@ -1,7 +1,7 @@
import numpy as np
import torch
from sklearn.cluster import KMeans
def get_cluster_model(ckpt_path):
checkpoint = torch.load(ckpt_path)
kmeans_dict = {}

View File

@ -1,8 +1,11 @@
import math,pdb
import torch,pynvml
from torch.nn.functional import normalize
from time import time
import numpy as np
import pynvml
import torch
from torch.nn.functional import normalize
# device=torch.device("cuda:0")
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
""" Picks k points in the data based on the kmeans++ method.

View File

@ -1,19 +1,17 @@
import time,pdb
import tqdm
from time import time as ttime
import os
from pathlib import Path
import logging
import argparse
from kmeans import KMeansGPU
import torch
import logging
import os
import time
from pathlib import Path
import numpy as np
from sklearn.cluster import KMeans,MiniBatchKMeans
import torch
import tqdm
from kmeans import KMeansGPU
from sklearn.cluster import KMeans, MiniBatchKMeans
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from time import time as ttime
import pynvml,torch
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):  # gpu_minibatch performs poorly; the library supports it, but it is not considered here
logger.info(f"Loading features from {in_dir}")
@ -29,7 +27,7 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
features = features.astype(np.float32)
logger.info(f"Clustering features of shape: {features.shape}")
t = time.time()
if(use_gpu==False):
if(use_gpu is False):
if use_minibatch:
kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
else:
@ -37,14 +35,14 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
else:
kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
features=torch.from_numpy(features)#.to(device)
labels = kmeans.fit_predict(features)#
kmeans.fit_predict(features)#
print(time.time()-t, "s")
x = {
"n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
"_n_threads": kmeans._n_threads if use_gpu==False else 4,
"cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
"n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
"_n_threads": kmeans._n_threads if use_gpu is False else 4,
"cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
}
print("end")

View File

@ -1,14 +1,13 @@
import time
import os
import random
import numpy as np
import torch
import torch.utils.data
import modules.commons as commons
import utils
from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
from modules.mel_processing import spectrogram_torch
from utils import load_filepaths_and_text, load_wav_to_torch
# import h5py
@ -87,7 +86,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
audio_norm = audio_norm[:, :lmin * self.hop_length]
if volume!= None:
if volume is not None:
volume = volume[:lmin]
return c, f0, spec, audio_norm, spk, uv, volume
@ -96,7 +95,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
# print("skip too short audio:", filename)
# return None
if random.choice([True, False]) and self.vol_aug and volume!=None:
if random.choice([True, False]) and self.vol_aug and volume is not None:
max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
max_shift = min(1, np.log10(1/max_amp))
log10_vol_shift = random.uniform(-1, max_shift)
@ -114,7 +113,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
end = start + 790
spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
if volume !=None:
if volume is not None:
volume = volume[start:end]
return c, f0, spec, audio_norm, spk, uv,volume
@ -178,7 +177,7 @@ class TextAudioCollate:
uv = row[5]
uv_padded[i, :uv.size(0)] = uv
volume = row[6]
if volume != None:
if volume is not None:
volume_padded[i, :volume.size(0)] = volume
else :
volume_padded = None

View File

@ -1,13 +1,14 @@
import os
import random
import re
import numpy as np
import librosa
import numpy as np
import torch
import random
from utils import repeat_expand_2d
from tqdm import tqdm
from torch.utils.data import Dataset
from tqdm import tqdm
from utils import repeat_expand_2d
def traverse_dir(
root_dir,
@ -130,7 +131,7 @@ class AudioDataset(Dataset):
with open(filelists,"r") as f:
self.paths = f.read().splitlines()
for name_ext in tqdm(self.paths, total=len(self.paths)):
name = os.path.splitext(name_ext)[0]
os.path.splitext(name_ext)[0]
path_audio = name_ext
duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)

View File

@ -1,10 +1,10 @@
from collections import deque
from functools import partial
from inspect import isfunction
import torch.nn.functional as F
import librosa.sequence
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm
@ -26,8 +26,10 @@ def extract(a, t, x_shape):
def noise_like(shape, device, repeat=False):
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
noise = lambda: torch.randn(shape, device=device)
def repeat_noise():
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
def noise():
return torch.randn(shape, device=device)
return repeat_noise() if repeat else noise()
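
This lambda-to-def rewrite recurs throughout the commit: binding a lambda to a name is flagged by pycodestyle rule E731 ("do not assign a lambda expression, use a def"), which the `E` family in the new `.ruff.toml` enables. Schematically:

# Before: flagged by E731
square = lambda x: x * x

# After: the equivalent def, which also carries a real __name__ in tracebacks
def square(x):
    return x * x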
@ -253,7 +255,11 @@ class GaussianDiffusion(nn.Module):
if method is not None and infer_speedup > 1:
if method == 'dpm-solver' or method == 'dpm-solver++':
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
from .dpm_solver_pytorch import (
DPM_Solver,
NoiseScheduleVP,
model_wrapper,
)
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
@ -331,7 +337,7 @@ class GaussianDiffusion(nn.Module):
infer_speedup, cond=cond
)
elif method == 'unipc':
from .uni_pc import NoiseScheduleVP, model_wrapper, UniPC
from .uni_pc import NoiseScheduleVP, UniPC, model_wrapper
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])

View File

@ -1,15 +1,14 @@
import math
from collections import deque
from functools import partial
from inspect import isfunction
import torch.nn.functional as F
import librosa.sequence
import numpy as np
from torch.nn import Conv1d
from torch.nn import Mish
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Conv1d, Mish
from tqdm import tqdm
import math
def exists(x):
@ -27,8 +26,10 @@ def extract(a, t):
def noise_like(shape, device, repeat=False):
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
noise = lambda: torch.randn(shape, device=device)
def repeat_noise():
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
def noise():
return torch.randn(shape, device=device)
return repeat_noise() if repeat else noise()
@ -389,7 +390,11 @@ class GaussianDiffusion(nn.Module):
if method is not None and infer_speedup > 1:
if method == 'dpm-solver':
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
from .dpm_solver_pytorch import (
DPM_Solver,
NoiseScheduleVP,
model_wrapper,
)
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
@ -577,7 +582,7 @@ class GaussianDiffusion(nn.Module):
noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)
ot = step_range[0]
ot_1 = torch.full((1,), ot, device=device, dtype=torch.long)
torch.full((1,), ot, device=device, dtype=torch.long)
for t in step_range:
t_1 = torch.full((1,), t, device=device, dtype=torch.long)

View File

@ -1,6 +1,4 @@
import torch
import torch.nn.functional as F
import math
class NoiseScheduleVP:
@ -559,7 +557,7 @@ class DPM_Solver:
x_t: A pytorch tensor. The approximated solution at time `t`.
"""
ns = self.noise_schedule
dims = x.dim()
x.dim()
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
h = lambda_t - lambda_s
log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
@ -984,12 +982,16 @@ class DPM_Solver:
nfe = 0
if order == 2:
r1 = 0.5
lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
def lower_update(x, s, t):
return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
def higher_update(x, s, t, **kwargs):
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
elif order == 3:
r1, r2 = 1. / 3., 2. / 3.
lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
def lower_update(x, s, t):
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
def higher_update(x, s, t, **kwargs):
return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
else:
raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
while torch.abs((s - t_0)).mean() > t_err:
@ -997,7 +999,8 @@ class DPM_Solver:
x_lower, lower_noise_kwargs = lower_update(x, s, t)
x_higher = higher_update(x, s, t, **lower_noise_kwargs)
delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
def norm_fn(v):
return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
E = norm_fn((x_higher - x_lower) / delta).max()
if torch.all(E <= 1.):
x = x_higher

View File

@ -1,6 +1,6 @@
import numpy as np
import torch
import torch.nn.functional as F
from diffusion.unit2mel import load_model_vocoder

View File

@ -2,16 +2,16 @@
author: wayn391@mastertones
'''
import os
import json
import time
import yaml
import datetime
import torch
import os
import time
import matplotlib.pyplot as plt
from . import utils
import torch
import yaml
from torch.utils.tensorboard import SummaryWriter
class Saver(object):
def __init__(
self,

View File

@ -1,8 +1,9 @@
import os
import yaml
import json
import pickle
import os
import torch
import yaml
def traverse_dir(
root_dir,
@ -121,6 +122,6 @@ def load_model(
ckpt = torch.load(path_pt, map_location=torch.device(device))
global_step = ckpt['global_step']
model.load_state_dict(ckpt['model'], strict=False)
if ckpt.get('optimizer') != None:
if ckpt.get("optimizer") is not None:
optimizer.load_state_dict(ckpt['optimizer'])
return global_step, model, optimizer

View File

@ -1,12 +1,12 @@
from diffusion_onnx import GaussianDiffusion
import os
import yaml
import numpy as np
import torch
import torch.nn as nn
import numpy as np
from wavenet import WaveNet
import torch.nn.functional as F
import diffusion
import yaml
from diffusion_onnx import GaussianDiffusion
class DotDict(dict):
def __getattr__(*args):
@ -147,8 +147,8 @@ class Unit2Mel(nn.Module):
spks.update({i:1.0/float(self.n_spk)})
spk_mix = torch.tensor(spk_mix)
spk_mix = spk_mix.repeat(n_frames, 1)
orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
self.forward(hubert, mel2ph, f0, volume, spk_mix)
if export_encoder:
torch.onnx.export(
self,
@ -182,8 +182,8 @@ class Unit2Mel(nn.Module):
spk_mix.append(1.0/float(self.n_spk))
spks.update({i:1.0/float(self.n_spk)})
spk_mix = torch.tensor(spk_mix)
orgouttt = self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
self.forward(hubert, mel2ph, f0, volume, spk_mix)
torch.onnx.export(
self,

View File

@ -1,13 +1,15 @@
import os
import time
import librosa
import numpy as np
import torch
import librosa
from diffusion.logger.saver import Saver
from diffusion.logger import utils
from torch import autocast
from torch.cuda.amp import GradScaler
from diffusion.logger import utils
from diffusion.logger.saver import Saver
def test(args, model, vocoder, loader_test, saver):
print(' [*] testing...')
model.eval()

View File

@ -1,7 +1,7 @@
import torch
import torch.nn.functional as F
import math
import torch
class NoiseScheduleVP:
def __init__(
@ -109,7 +109,8 @@ class NoiseScheduleVP:
elif self.schedule == 'linear':
return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
elif self.schedule == 'cosine':
log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
def log_alpha_fn(s):
return torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
return log_alpha_t
@ -147,7 +148,8 @@ class NoiseScheduleVP:
return t.reshape((-1,))
else:
log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
def t_fn(log_alpha_t):
return torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2.0 * (1.0 + self.cosine_s) / math.pi - self.cosine_s
t = t_fn(log_alpha)
return t

View File

@ -1,11 +1,14 @@
import os
import yaml
import numpy as np
import torch
import torch.nn as nn
import numpy as np
import yaml
from .diffusion import GaussianDiffusion
from .wavenet import WaveNet
from .vocoder import Vocoder
from .wavenet import WaveNet
class DotDict(dict):
def __getattr__(*args):
@ -21,9 +24,11 @@ def load_model_vocoder(
device='cpu',
config_path = None
):
if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
else: config_file = config_path
if config_path is None:
config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
else:
config_file = config_path
with open(config_file, "r") as config:
args = yaml.safe_load(config)
args = DotDict(args)
@ -116,13 +121,13 @@ class Unit2Mel(nn.Module):
hubert_hidden_size = self.input_channel
n_frames = 10
hubert = torch.randn((1, n_frames, hubert_hidden_size))
mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
torch.arange(end=n_frames).unsqueeze(0).long()
f0 = torch.randn((1, n_frames))
volume = torch.randn((1, n_frames))
spks = {}
for i in range(n_spk):
spks.update({i:1.0/float(self.n_spk)})
orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):

View File

@ -1,9 +1,10 @@
import torch
from vdecoder.nsf_hifigan.nvSTFT import STFT
from vdecoder.nsf_hifigan.models import load_model,load_config
from torchaudio.transforms import Resample
from vdecoder.nsf_hifigan.models import load_config, load_model
from vdecoder.nsf_hifigan.nvSTFT import STFT
class Vocoder:
def __init__(self, vocoder_type, vocoder_ckpt, device = None):
if device is None:

View File

@ -7,7 +7,7 @@ import torchaudio
from flask import Flask, request, send_file
from flask_cors import CORS
from inference.infer_tool import Svc, RealTimeVC
from inference.infer_tool import RealTimeVC, Svc
app = Flask(__name__)

View File

@ -1,10 +1,10 @@
import io
import numpy as np
import soundfile
from flask import Flask, request, send_file
from inference import infer_tool
from inference import slicer
from inference import infer_tool, slicer
app = Flask(__name__)

View File

@ -1,15 +1,16 @@
import gc
import hashlib
import io
import json
import logging
import os
import pickle
import time
from pathlib import Path
from inference import slicer
import gc
import librosa
import numpy as np
# import onnxruntime
import soundfile
import torch
@ -17,11 +18,9 @@ import torchaudio
import cluster
import utils
from models import SynthesizerTrn
import pickle
from diffusion.unit2mel import load_model_vocoder
import yaml
from inference import slicer
from models import SynthesizerTrn
logging.getLogger('matplotlib').setLevel(logging.WARNING)
@ -153,7 +152,7 @@ class Svc(object):
self.hop_size = self.diffusion_args.data.block_size
self.spk2id = self.diffusion_args.spk
self.speech_encoder = self.diffusion_args.data.encoder
self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode!=None else 'left'
self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
if spk_mix_enable:
self.diffusion_model.init_spkmix(len(self.spk2id))
else:
@ -180,7 +179,8 @@ class Svc(object):
else:
self.feature_retrieval=False
if self.shallow_diffusion : self.nsf_hifigan_enhance = False
if self.shallow_diffusion :
self.nsf_hifigan_enhance = False
if self.nsf_hifigan_enhance:
from modules.enhancer import Enhancer
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
@ -290,7 +290,7 @@ class Svc(object):
audio = torch.FloatTensor(wav).to(self.dev)
audio_mel = None
if self.only_diffusion or self.shallow_diffusion:
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
if self.shallow_diffusion and second_encoding:
audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
audio16k = torch.from_numpy(audio16k).to(self.dev)
@ -443,7 +443,8 @@ class Svc(object):
datas = [data]
for k,dat in enumerate(datas):
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
if clip_seconds!=0:
print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
# padd
pad_len = int(audio_sr * pad_seconds)
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])

View File

@ -1,22 +1,18 @@
import hashlib
import json
import io
import logging
import os
import time
from pathlib import Path
import io
import librosa
import maad
import numpy as np
from inference import slicer
import parselmouth
import soundfile
import torch
import torchaudio
from hubert import hubert_model
import utils
from inference import slicer
from models import SynthesizerTrn
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
@ -93,7 +89,7 @@ class VitsSvc(object):
def set_device(self, device):
self.device = torch.device(device)
self.hubert_soft.to(self.device)
if self.SVCVITS != None:
if self.SVCVITS is not None:
self.SVCVITS.to(self.device)
def loadCheckpoint(self, path):

View File

@ -1,15 +1,10 @@
import io
import logging
import time
from pathlib import Path
from spkmix import spk_mix_map
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
from spkmix import spk_mix_map
logging.getLogger('numba').setLevel(logging.WARNING)
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
@ -146,8 +141,10 @@ def main():
key = "auto" if auto_predict_f0 else f"{tran}key"
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
isdiffusion = "sovits"
if shallow_diffusion : isdiffusion = "sovdiff"
if only_diffusion : isdiffusion = "diff"
if shallow_diffusion :
isdiffusion = "sovdiff"
if only_diffusion :
isdiffusion = "diff"
if use_spk_mix:
spk = "spk_mix"
res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}'

View File

@ -1,20 +1,17 @@
import copy
import math
import torch
from torch import nn
from torch.nn import Conv1d, Conv2d
from torch.nn import functional as F
from torch.nn.utils import spectral_norm, weight_norm
import modules.attentions as attentions
import modules.commons as commons
import modules.modules as modules
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
import utils
from modules.commons import init_weights, get_padding
from modules.commons import get_padding
from utils import f0_to_coarse
class ResidualCouplingBlock(nn.Module):
def __init__(self,
channels,
@ -125,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@ -160,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
@ -407,7 +404,7 @@ class SynthesizerTrn(nn.Module):
g = self.emb_g(g).transpose(1,2)
# vol proj
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
# ssl prenet
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
@ -452,7 +449,7 @@ class SynthesizerTrn(nn.Module):
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
# vol proj
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol

View File

@ -1,7 +1,9 @@
from modules.F0Predictor.F0Predictor import F0Predictor
from modules.F0Predictor.crepe import CrepePitchExtractor
import torch
from modules.F0Predictor.crepe import CrepePitchExtractor
from modules.F0Predictor.F0Predictor import F0Predictor
class CrepeF0Predictor(F0Predictor):
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"):
self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model)

View File

@ -1,6 +1,8 @@
from modules.F0Predictor.F0Predictor import F0Predictor
import pyworld
import numpy as np
import pyworld
from modules.F0Predictor.F0Predictor import F0Predictor
class DioF0Predictor(F0Predictor):
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):

View File

@ -1,6 +1,8 @@
from modules.F0Predictor.F0Predictor import F0Predictor
import pyworld
import numpy as np
import pyworld
from modules.F0Predictor.F0Predictor import F0Predictor
class HarvestF0Predictor(F0Predictor):
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):

View File

@ -1,6 +1,8 @@
from modules.F0Predictor.F0Predictor import F0Predictor
import parselmouth
import numpy as np
import parselmouth
from modules.F0Predictor.F0Predictor import F0Predictor
class PMF0Predictor(F0Predictor):
def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):

View File

@ -1,14 +1,14 @@
from typing import Optional,Union
from typing import Optional, Union
try:
from typing import Literal
except Exception as e:
except Exception:
from typing_extensions import Literal
import numpy as np
import torch
import torchcrepe
from torch import nn
from torch.nn import functional as F
import scipy
#from:https://github.com/fishaudio/fish-diffusion
@ -334,7 +334,7 @@ class CrepePitchExtractor(BasePitchExtractor):
f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
if torch.all(f0 == 0):
rtn = f0.cpu().numpy() if pad_to==None else np.zeros(pad_to)
rtn = f0.cpu().numpy() if pad_to is None else np.zeros(pad_to)
return rtn,rtn
return self.post_process(x, sampling_rate, f0, pad_to)

View File

@ -1,12 +1,10 @@
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import modules.commons as commons
import modules.modules as modules
from modules.modules import LayerNorm
@ -243,7 +241,7 @@ class MultiHeadAttention(nn.Module):
return ret
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)

View File

@ -1,9 +1,9 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def slice_pitch_segments(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :segment_size])
for i in range(x.size(0)):
@ -134,12 +134,6 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
return acts
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def shift_1d(x):
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
return x
@ -157,7 +151,6 @@ def generate_path(duration, mask):
duration: [b, 1, t_x]
mask: [b, 1, t_y, t_x]
"""
device = duration.device
b, _, t_y, t_x = mask.shape
cum_duration = torch.cumsum(duration, -1)

View File

@ -1,10 +1,12 @@
import numpy as np
import torch
import torch.nn.functional as F
from vdecoder.nsf_hifigan.nvSTFT import STFT
from vdecoder.nsf_hifigan.models import load_model
from torchaudio.transforms import Resample
from vdecoder.nsf_hifigan.models import load_model
from vdecoder.nsf_hifigan.nvSTFT import STFT
class Enhancer:
def __init__(self, enhancer_type, enhancer_ckpt, device=None):
if device is None:

View File

@ -1,7 +1,4 @@
import torch
from torch.nn import functional as F
import modules.commons as commons
import torch
def feature_loss(fmap_r, fmap_g):

View File

@ -1,16 +1,5 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn
MAX_WAV_VALUE = 32768.0

View File

@ -1,17 +1,11 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import Conv1d
from torch.nn import functional as F
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm
from torch.nn.utils import remove_weight_norm, weight_norm
import modules.commons as commons
from modules.commons import init_weights, get_padding
from modules.commons import get_padding, init_weights
LRELU_SLOPE = 0.1

View File

@ -1,6 +1,8 @@
import torch
from onnxexport.model_onnx import SynthesizerTrn
import utils
from onnxexport.model_onnx import SynthesizerTrn
def main(NetExport):
path = "SoVits4.0"

View File

@ -1,8 +1,11 @@
import torch
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
import utils
import json
import torch
import utils
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
def main():
path = "crs"
@ -127,7 +130,7 @@ def main():
"Characters": spklist
}
MoeVSConfJson = json.dumps(MoeVSConf)
json.dumps(MoeVSConf)
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)

View File

@ -1,18 +1,16 @@
import torch
from torch import nn
from torch.nn import Conv1d, Conv2d
from torch.nn import functional as F
from torch.nn.utils import spectral_norm, weight_norm
import modules.attentions as attentions
import modules.commons as commons
import modules.modules as modules
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
import utils
from modules.commons import init_weights, get_padding
from vdecoder.hifigan.models import Generator
from modules.commons import get_padding
from utils import f0_to_coarse
from vdecoder.hifigan.models import Generator
class ResidualCouplingBlock(nn.Module):
@ -124,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@ -159,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),

View File

@ -1,20 +1,12 @@
import copy
import math
import torch
from torch import nn
from torch.nn import functional as F
import modules.attentions as attentions
import modules.commons as commons
import modules.modules as modules
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
import utils
from modules.commons import init_weights, get_padding
from utils import f0_to_coarse
class ResidualCouplingBlock(nn.Module):
def __init__(self,
channels,
@ -259,7 +251,7 @@ class SynthesizerTrn(nn.Module):
x_mask = torch.unsqueeze(torch.ones_like(f0), 1).to(c.dtype)
# vol proj
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol

View File

@ -1,11 +1,11 @@
import os
import argparse
import json
import os
import re
import wave
from random import shuffle
from tqdm import tqdm
from random import shuffle
import json
import wave
import diffusion.logger.utils as du

View File

@ -1,19 +1,20 @@
import os
import utils
import torch
import random
import librosa
import logging
import argparse
import logging
import multiprocessing
import numpy as np
import diffusion.logger.utils as du
from glob import glob
from tqdm import tqdm
from random import shuffle
from diffusion.vocoder import Vocoder
import os
import random
from concurrent.futures import ProcessPoolExecutor
from glob import glob
from random import shuffle
import librosa
import numpy as np
import torch
from tqdm import tqdm
import diffusion.logger.utils as du
import utils
from diffusion.vocoder import Vocoder
from modules.mel_processing import spectrogram_torch
logging.getLogger("numba").setLevel(logging.WARNING)

View File

@ -1,10 +1,11 @@
import os
import argparse
import concurrent.futures
import os
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
import librosa
import numpy as np
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
from scipy.io import wavfile
from tqdm import tqdm

View File

@ -1,39 +1,30 @@
import logging
import multiprocessing
import os
import time
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
import os
import json
import argparse
import itertools
import math
import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.cuda.amp import GradScaler, autocast
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
import modules.commons as commons
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioCollate
from data_utils import TextAudioCollate, TextAudioSpeakerLoader
from models import (
SynthesizerTrn,
MultiPeriodDiscriminator,
SynthesizerTrn,
)
from modules.losses import (
kl_loss,
generator_loss, discriminator_loss, feature_loss
)
from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
torch.backends.cudnn.benchmark = True
global_step = 0
start_time = time.time()
@ -287,7 +278,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
c = c[:1].cuda(0)
f0 = f0[:1].cuda(0)
uv= uv[:1].cuda(0)
if volume!=None:
if volume is not None:
volume = volume[:1].cuda(0)
mel = spec_to_mel_torch(
spec,
@ -314,7 +305,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
f"gt/audio_{batch_idx}": y[0]
})
image_dict.update({
f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
})
utils.summarize(

View File

@ -1,9 +1,10 @@
import os
import argparse
import torch
from torch.optim import lr_scheduler
from diffusion.logger import utils
from diffusion.data_loaders import get_data_loaders
from diffusion.logger import utils
from diffusion.solver import train
from diffusion.unit2mel import Unit2Mel
from diffusion.vocoder import Vocoder

View File

@ -1,8 +1,8 @@
import utils
import pickle
import os
import argparse
import os
import pickle
import utils
if __name__ == "__main__":
parser = argparse.ArgumentParser()

View File

@ -1,22 +1,18 @@
import os
import glob
import re
import sys
import argparse
import logging
import glob
import json
import logging
import os
import re
import subprocess
import warnings
import random
import functools
import sys
import faiss
import librosa
import numpy as np
from scipy.io.wavfile import read
import torch
from scipy.io.wavfile import read
from torch.nn import functional as F
from modules.commons import sequence_mask
import faiss
import tqdm
MATPLOTLIB_FLAG = False
@ -201,15 +197,20 @@ def clean_checkpoints(path_to_models='logs/44k/', n_ckpts_to_keep=2, sort_by_tim
False -> lexicographically delete ckpts
"""
ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
name_key = (lambda _f: int(re.compile('._(\d+)\.pth').match(_f).group(1)))
time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)))
def name_key(_f):
return int(re.compile("._(\\d+)\\.pth").match(_f).group(1))
def time_key(_f):
return os.path.getmtime(os.path.join(path_to_models, _f))
sort_key = time_key if sort_by_time else name_key
x_sorted = lambda _x: sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith('_0.pth')], key=sort_key)
def x_sorted(_x):
return sorted([f for f in ckpts_files if f.startswith(_x) and not f.endswith("_0.pth")], key=sort_key)
to_del = [os.path.join(path_to_models, fn) for fn in
(x_sorted('G')[:-n_ckpts_to_keep] + x_sorted('D')[:-n_ckpts_to_keep])]
del_info = lambda fn: logger.info(f".. Free up space by deleting ckpt {fn}")
del_routine = lambda x: [os.remove(x), del_info(x)]
rs = [del_routine(fn) for fn in to_del]
def del_info(fn):
return logger.info(f".. Free up space by deleting ckpt {fn}")
def del_routine(x):
return [os.remove(x), del_info(x)]
[del_routine(fn) for fn in to_del]
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
for k, v in scalars.items():
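
A usage sketch for the `clean_checkpoints` helper refactored above, with the signature taken from the hunk header (`utils` is this repository's own module; the truncated parameter appears in full as `sort_by_time` in the body):

import utils

# Keep only the two newest G_* and D_* checkpoints, ordered by file mtime;
# sort_by_time=False would instead sort lexicographically by step number.
utils.clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True)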

View File

@ -1,13 +1,15 @@
import os
import json
from .env import AttrDict
import os
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from .env import AttrDict
from .utils import get_padding, init_weights
LRELU_SLOPE = 0.1
@ -199,7 +201,7 @@ class SineGen(torch.nn.Module):
output uv: tensor(batchsize=1, length, 1)
"""
with torch.no_grad():
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
torch.zeros(f0.shape[0], f0.shape[1], self.dim,
device=f0.device)
# fundamental component
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
@ -353,7 +355,7 @@ class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@ -412,7 +414,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),

View File

@ -1,15 +1,13 @@
import math
import os
os.environ["LRU_CACHE_CAPACITY"] = "3"
import random
import librosa
import numpy as np
import soundfile as sf
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read
import soundfile as sf
os.environ["LRU_CACHE_CAPACITY"] = "3"
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
sampling_rate = None

View File

@ -1,10 +1,10 @@
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
# matplotlib.use("Agg")
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm
def plot_spectrogram(spectrogram):

View File

@ -1,6 +1,6 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
from .act import *
from .filter import *
from .resample import *
from .act import *

View File

@ -4,10 +4,10 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import sin, pow
from torch import pow, sin
from torch.nn import Parameter
from .resample import UpSample1d, DownSample1d
from .resample import DownSample1d, UpSample1d
class Activation1d(nn.Module):

View File

@ -1,10 +1,11 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
if 'sinc' in dir(torch):
sinc = torch.sinc

View File

@ -3,8 +3,8 @@
import torch.nn as nn
from torch.nn import functional as F
from .filter import LowPassFilter1d
from .filter import kaiser_sinc_filter1d
from .filter import LowPassFilter1d, kaiser_sinc_filter1d
class UpSample1d(nn.Module):

View File

@ -1,15 +1,18 @@
import os
import json
from .env import AttrDict
import os
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from vdecoder.hifiganwithsnake.alias.act import SnakeAlias
from .env import AttrDict
from .utils import get_padding, init_weights
LRELU_SLOPE = 0.1
@ -211,7 +214,7 @@ class SineGen(torch.nn.Module):
output uv: tensor(batchsize=1, length, 1)
"""
with torch.no_grad():
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
torch.zeros(f0.shape[0], f0.shape[1], self.dim,
device=f0.device)
# fundamental component
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
@ -370,7 +373,7 @@ class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@ -429,7 +432,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),

View File

@ -1,15 +1,13 @@
import math
import os
os.environ["LRU_CACHE_CAPACITY"] = "3"
import random
import librosa
import numpy as np
import soundfile as sf
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read
import soundfile as sf
os.environ["LRU_CACHE_CAPACITY"] = "3"
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
sampling_rate = None

View File

@ -1,10 +1,10 @@
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
# matplotlib.use("Agg")
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm
def plot_spectrogram(spectrogram):

View File

@ -1,13 +1,15 @@
import os
import json
from .env import AttrDict
import os
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
from .env import AttrDict
from .utils import get_padding, init_weights
LRELU_SLOPE = 0.1
@ -289,7 +291,7 @@ class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@ -348,7 +350,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),

View File

@ -1,16 +1,14 @@
import math
import os
os.environ["LRU_CACHE_CAPACITY"] = "3"
import random
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
os.environ["LRU_CACHE_CAPACITY"] = "3"
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
sampling_rate = None

View File

@ -1,10 +1,12 @@
import glob
import os
import matplotlib
import matplotlib.pylab as plt
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt
def plot_spectrogram(spectrogram):

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import torch
from fairseq import checkpoint_utils
from vencoder.encoder import SpeechEncoder
class CNHubertLarge(SpeechEncoder):
def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import onnxruntime
import torch
from vencoder.encoder import SpeechEncoder
class ContentVec256L12_Onnx(SpeechEncoder):
def __init__(self, vec_path="pretrain/vec-256-layer-12.onnx", device=None):

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import torch
from fairseq import checkpoint_utils
from vencoder.encoder import SpeechEncoder
class ContentVec256L9(SpeechEncoder):
def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):

View File

@ -1,7 +1,9 @@
from vencoder.encoder import SpeechEncoder
import onnxruntime
import torch
from vencoder.encoder import SpeechEncoder
class ContentVec256L9_Onnx(SpeechEncoder):
def __init__(self, vec_path="pretrain/vec-256-layer-9.onnx", device=None):
super().__init__()

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import torch
from fairseq import checkpoint_utils
from vencoder.encoder import SpeechEncoder
class ContentVec768L12(SpeechEncoder):
def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import onnxruntime
import torch
from vencoder.encoder import SpeechEncoder
class ContentVec768L12_Onnx(SpeechEncoder):
def __init__(self, vec_path="pretrain/vec-768-layer-12.onnx", device=None):

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import onnxruntime
import torch
from vencoder.encoder import SpeechEncoder
class ContentVec768L9_Onnx(SpeechEncoder):
def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):

View File

@ -1,6 +1,7 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.dphubert.model import wav2vec2_model
from vencoder.encoder import SpeechEncoder
class DPHubert(SpeechEncoder):

View File

@ -1,5 +1,6 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.encoder import SpeechEncoder
from vencoder.hubert import hubert_model

View File

@ -1,7 +1,8 @@
from vencoder.encoder import SpeechEncoder
import onnxruntime
import torch
from vencoder.encoder import SpeechEncoder
class HubertSoft_Onnx(SpeechEncoder):
def __init__(self, vec_path="pretrain/hubert-soft.onnx", device=None):

View File

@ -1,5 +1,6 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.encoder import SpeechEncoder
from vencoder.wavlm.WavLM import WavLM, WavLMConfig

View File

@ -1,8 +1,8 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.whisper.model import Whisper, ModelDimensions
from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
from vencoder.encoder import SpeechEncoder
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
from vencoder.whisper.model import ModelDimensions, Whisper
class WhisperPPG(SpeechEncoder):

View File

@ -1,8 +1,8 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.whisper.model import Whisper, ModelDimensions
from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
from vencoder.encoder import SpeechEncoder
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
from vencoder.whisper.model import ModelDimensions, Whisper
class WhisperPPGLarge(SpeechEncoder):

View File

@ -5,19 +5,19 @@ https://github.com/pytorch/audio/blob/main/torchaudio/models/wav2vec2/components
"""
import math
from collections import defaultdict
from typing import List, Optional, Tuple
import math
import torch
from torch import nn, Tensor
from torch.nn import Module, Parameter
from torch import Tensor, nn
from torch.nn import Module
from .hardconcrete import HardConcrete
from .pruning_utils import (
prune_linear_layer,
prune_conv1d_layer,
prune_layer_norm,
prune_linear_layer,
)

View File

@ -10,7 +10,7 @@ from typing import Any, Dict
from torch.nn import Module
from ..model import wav2vec2_model, Wav2Vec2Model, wavlm_model
from ..model import Wav2Vec2Model, wav2vec2_model, wavlm_model
_LG = logging.getLogger(__name__)

View File

@ -7,26 +7,26 @@
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import logging
import math
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import LayerNorm
from vencoder.wavlm.modules import (
Fp32GroupNorm,
Fp32LayerNorm,
GLU_Linear,
GradMultiply,
MultiheadAttention,
SamePad,
init_bert_params,
get_activation_fn,
TransposeLast,
GLU_Linear,
get_activation_fn,
init_bert_params,
)
logger = logging.getLogger(__name__)
@ -402,9 +402,7 @@ class ConvFeatureExtractionModel(nn.Module):
nn.init.kaiming_normal_(conv.weight)
return conv
assert (
is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive"
if is_layer_norm:
return nn.Sequential(

View File

@ -10,10 +10,11 @@
import math
import warnings
from typing import Dict, Optional, Tuple
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.nn import Parameter
import torch.nn.functional as F
class TransposeLast(nn.Module):

View File

@ -1,4 +1,3 @@
import os
from functools import lru_cache
from typing import Union
@ -6,11 +5,10 @@ import ffmpeg
import numpy as np
import torch
import torch.nn.functional as F
from librosa.filters import mel as librosa_mel_fn
from .utils import exact_div
from librosa.filters import mel as librosa_mel_fn
# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400

View File

@ -1,5 +1,5 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
@ -32,7 +32,7 @@ def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None)
if tokenizer is None:
tokenizer = get_tokenizer(model.is_multilingual)
if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence:
raise ValueError(f"This model doesn't have language tokens so it can't perform lang id")
raise ValueError("This model doesn't have language tokens so it can't perform lang id")
single = mel.ndim == 2
if single:

View File

@ -1,14 +1,13 @@
from dataclasses import dataclass
from typing import Dict
from typing import Iterable, Optional
from typing import Dict, Iterable, Optional
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
from torch import Tensor, nn
from .decoding import detect_language as detect_language_function, decode as decode_function
from .decoding import decode as decode_function
from .decoding import detect_language as detect_language_function
@dataclass

View File

@ -196,7 +196,7 @@ class Tokenizer:
def language_token(self) -> int:
"""Returns the token id corresponding to the value of the `language` field"""
if self.language is None:
raise ValueError(f"This tokenizer does not have language token configured")
raise ValueError("This tokenizer does not have language token configured")
additional_tokens = dict(
zip(

View File

@ -1,7 +1,9 @@
from google.colab import files
import shutil
import os
import argparse
import os
import shutil
from google.colab import files
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--type", type=str, required=True, help="type of file to upload")

View File

@ -1,5 +1,11 @@
import io
import json
import logging
import os
import re
import subprocess
import time
import traceback
from itertools import chain
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
@ -7,22 +13,12 @@ import gradio.processing_utils as gr_pu
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import re
import json
import subprocess
import edge_tts
import asyncio
from scipy.io import wavfile
import librosa
import torch
import time
import traceback
from itertools import chain
from utils import mix_model
from scipy.io import wavfile
from compress_model import removeOptimizer
from inference.infer_tool import Svc
from utils import mix_model
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
@ -42,14 +38,15 @@ if torch.cuda.is_available():
def upload_mix_append_file(files,sfiles):
try:
if(sfiles == None):
if(sfiles is None):
file_paths = [file.name for file in files]
else:
file_paths = [file.name for file in chain(files,sfiles)]
p = {file:100 for file in file_paths}
return file_paths,mix_model_output1.update(value=json.dumps(p,indent=2))
except Exception as e:
if debug: traceback.print_exc()
if debug:
traceback.print_exc()
raise gr.Error(e)
def mix_submit_click(js,mode):
@ -63,16 +60,19 @@ def mix_submit_click(js,mode):
path = mix_model(model_path,mix_rate,mode)
return f"成功,文件被保存在了{path}"
except Exception as e:
if debug: traceback.print_exc()
if debug:
traceback.print_exc()
raise gr.Error(e)
def updata_mix_info(files):
try:
if files == None : return mix_model_output1.update(value="")
if files is None :
return mix_model_output1.update(value="")
p = {file.name:100 for file in files}
return mix_model_output1.update(value=json.dumps(p,indent=2))
except Exception as e:
if debug: traceback.print_exc()
if debug:
traceback.print_exc()
raise gr.Error(e)
def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix):
@ -112,7 +112,8 @@ def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_
msg += i + " "
return sid.update(choices = spks,value=spks[0]), msg
except Exception as e:
if debug: traceback.print_exc()
if debug:
traceback.print_exc()
raise gr.Error(e)
@ -172,7 +173,8 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
soundfile.write(output_file, _audio, model.target_sample, format="wav")
return "Success", output_file
except Exception as e:
if debug: traceback.print_exc()
if debug:
traceback.print_exc()
raise gr.Error(e)
def tts_func(_text,_rate,_voice):
@ -180,7 +182,8 @@ def tts_func(_text,_rate,_voice):
# voice = "zh-CN-XiaoyiNeural"#女性,较高音
# voice = "zh-CN-YunxiNeural"#男性
voice = "zh-CN-YunxiNeural"#男性
if ( _voice == "" ) : voice = "zh-CN-XiaoyiNeural"
if ( _voice == "" ) :
voice = "zh-CN-XiaoyiNeural"
output_file = _text[0:10]+".wav"
# communicate = edge_tts.Communicate(_text, voice)
# await communicate.save(output_file)
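
The commented-out lines show the underlying synthesis call. Assuming the `edge-tts` package these comments reference, a self-contained version of that pattern:

import asyncio

import edge_tts

async def tts_to_file(text: str, voice: str = "zh-CN-YunxiNeural", out_path: str = "tts.wav"):
    # Mirrors the commented-out calls above: build a Communicate object and save the audio.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out_path)

asyncio.run(tts_to_file("hello"))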