From f979532e818b61be8d055f0460135d5111f93ec7 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 24 Jun 2023 03:18:13 +0900 Subject: [PATCH 1/9] fix --- resample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resample.py b/resample.py index 954a676..275265e 100644 --- a/resample.py +++ b/resample.py @@ -37,7 +37,7 @@ def save_wav_to_path(wav, save_path, sr): def process(item): - spkdir, wav_name = item + spkdir, wav_name, args = item speaker = spkdir.replace("\\", "/").split("/")[-1] wav_path = os.path.join(args.in_dir, speaker, wav_name) @@ -79,7 +79,7 @@ def process_all_speakers(): spk_dir = os.path.join(args.in_dir, speaker) if os.path.isdir(spk_dir): print(spk_dir) - futures = [executor.submit(process, (spk_dir, i)) for i in os.listdir(spk_dir) if i.endswith("wav")] + futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")] for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): pass From 64be055fec53d5b281e58ed7f895be01dd00e5e1 Mon Sep 17 00:00:00 2001 From: asdfw13 <86564126+asdfw13@users.noreply.github.com> Date: Fri, 23 Jun 2023 03:12:02 +0800 Subject: [PATCH 2/9] Update inference_main.py --- inference_main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inference_main.py b/inference_main.py index 58aa176..ce87f37 100644 --- a/inference_main.py +++ b/inference_main.py @@ -141,7 +141,8 @@ def main(): if only_diffusion : isdiffusion = "diff" if use_spk_mix: spk = "spk_mix" - res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}' + f0_predictor = f0p + res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0_predictor}.{wav_format}' soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format) svc_model.clear_empty() From 7c0d113eae379312727ef04a38a356c547c0fd59 Mon Sep 17 00:00:00 2001 From: asdfw13 <86564126+asdfw13@users.noreply.github.com> Date: Fri, 23 Jun 2023 03:17:40 +0800 Subject: [PATCH 3/9] Update inference_main.py --- inference_main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inference_main.py b/inference_main.py index ce87f37..1780342 100644 --- a/inference_main.py +++ b/inference_main.py @@ -141,8 +141,7 @@ def main(): if only_diffusion : isdiffusion = "diff" if use_spk_mix: spk = "spk_mix" - f0_predictor = f0p - res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0_predictor}.{wav_format}' + res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}' soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format) svc_model.clear_empty() From 3691bbf5f38c6071855530f2efc84bcdb3b7c700 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Mon, 26 Jun 2023 00:58:52 +0800 Subject: [PATCH 4/9] Updata Depthwise Separable Conv1D to Infer Speed Up --- configs_template/config_template.json | 5 +- models.py | 52 +++++++++------ modules/DSConv.py | 76 ++++++++++++++++++++++ modules/commons.py | 6 +- modules/modules.py | 54 ++++++++-------- train.py | 18 +++--- vdecoder/hifigan/models.py | 59 ++++++++++------- vdecoder/hifigan/utils.py | 5 +- vdecoder/hifiganwithsnake/models.py | 91 ++++++++++++++------------- vdecoder/hifiganwithsnake/utils.py | 5 +- 10 files changed, 249 insertions(+), 122 deletions(-) create mode 100644 modules/DSConv.py diff --git a/configs_template/config_template.json b/configs_template/config_template.json index 670329c..e70b5d8 100644 --- a/configs_template/config_template.json +++ b/configs_template/config_template.json @@ -60,7 +60,10 @@ "vocoder_name":"nsf-hifigan", "speech_encoder":"vec768l12", "speaker_embedding":false, - "vol_embedding":false + "vol_embedding":false, + "use_depthwise_conv":false, + "use_depthwise_transposeconv":false, + "use_automatic_f0_prediction": true }, "spk": { "nyaru": 0, diff --git a/models.py b/models.py index 1f67b29..2b2d5a1 100644 --- a/models.py +++ b/models.py @@ -321,6 +321,9 @@ class SynthesizerTrn(nn.Module): sampling_rate=44100, vol_embedding=False, vocoder_name = "nsf-hifigan", + use_depthwise_conv = False, + use_depthwise_transposeconv = False, + use_automatic_f0_prediction = True, **kwargs): super().__init__() @@ -343,6 +346,8 @@ class SynthesizerTrn(nn.Module): self.ssl_dim = ssl_dim self.vol_embedding = vol_embedding self.emb_g = nn.Embedding(n_speakers, gin_channels) + self.use_depthwise_conv = use_depthwise_conv + self.use_automatic_f0_prediction = use_automatic_f0_prediction if vol_embedding: self.emb_vol = nn.Linear(1, hidden_channels) @@ -367,9 +372,12 @@ class SynthesizerTrn(nn.Module): "upsample_initial_channel": upsample_initial_channel, "upsample_kernel_sizes": upsample_kernel_sizes, "gin_channels": gin_channels, + "use_depthwise_conv":use_depthwise_conv, + "use_depthwise_transposeconv":use_depthwise_transposeconv } - + modules.set_Conv1dModel(self.use_depthwise_conv) + if vocoder_name == "nsf-hifigan": from vdecoder.hifigan.models import Generator self.dec = Generator(h=hps) @@ -383,16 +391,17 @@ class SynthesizerTrn(nn.Module): self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - self.f0_decoder = F0Decoder( - 1, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - spk_channels=gin_channels - ) + if self.use_automatic_f0_prediction: + self.f0_decoder = F0Decoder( + 1, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=gin_channels + ) self.emb_uv = nn.Embedding(2, hidden_channels) self.character_mix = False @@ -412,12 +421,16 @@ class SynthesizerTrn(nn.Module): # ssl prenet x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype) x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol - + # f0 predict - lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500 - norm_lf0 = utils.normalize_f0(lf0, x_mask, uv) - pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) - + if self.use_automatic_f0_prediction: + lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + else: + lf0 = 0 + norm_lf0 = 0 + pred_lf0 = 0 # encoder z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0)) z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) @@ -431,6 +444,7 @@ class SynthesizerTrn(nn.Module): return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 + @torch.no_grad() def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None): if c.device == torch.device("cuda"): @@ -453,10 +467,10 @@ class SynthesizerTrn(nn.Module): x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype) # vol proj vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0 - - x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol + + x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol - if predict_f0: + if self.use_automatic_f0_prediction and predict_f0: lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500 norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False) pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) diff --git a/modules/DSConv.py b/modules/DSConv.py new file mode 100644 index 0000000..9909521 --- /dev/null +++ b/modules/DSConv.py @@ -0,0 +1,76 @@ +import torch +import torch.nn as nn +from torch.nn.utils import weight_norm, remove_weight_norm + +class Depthwise_Separable_Conv1D(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + dilation = 1, + bias = True, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + self.depth_conv = remove_weight_norm(self.depth_conv, name = 'weight') + self.point_conv = remove_weight_norm(self.point_conv, name = 'weight') + +class Depthwise_Separable_TransposeConv1D(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + output_padding = 0, + bias = True, + dilation = 1, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.ConvTranspose1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,output_padding=output_padding,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + remove_weight_norm(self.depth_conv, name = 'weight') + remove_weight_norm(self.point_conv, name = 'weight') + + +def weight_norm_modules(module, name = 'weight', dim = 0): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.weight_norm() + return module + else: + return weight_norm(module,name,dim) + +def remove_weight_norm_modules(module, name = 'weight'): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.remove_weight_norm() + else: + remove_weight_norm(module,name) \ No newline at end of file diff --git a/modules/commons.py b/modules/commons.py index 0748880..c6e891d 100644 --- a/modules/commons.py +++ b/modules/commons.py @@ -24,10 +24,12 @@ def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4): def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if classname.find("Conv") != -1: + if "Depthwise_Separable" in classname: + m.depth_conv.weight.data.normal_(mean, std) + m.point_conv.weight.data.normal_(mean, std) + elif classname.find("Conv") != -1: m.weight.data.normal_(mean, std) - def get_padding(kernel_size, dilation=1): return int((kernel_size*dilation - dilation)/2) diff --git a/modules/modules.py b/modules/modules.py index 54290fd..c1326a3 100644 --- a/modules/modules.py +++ b/modules/modules.py @@ -1,20 +1,20 @@ -import copy -import math -import numpy as np -import scipy import torch from torch import nn from torch.nn import functional as F -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm +from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D import modules.commons as commons from modules.commons import init_weights, get_padding - LRELU_SLOPE = 0.1 +Conv1dModel = nn.Conv1d + +def set_Conv1dModel(use_depthwise_conv): + global Conv1dModel + Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d + class LayerNorm(nn.Module): def __init__(self, channels, eps=1e-5): @@ -44,13 +44,13 @@ class ConvReluNorm(nn.Module): self.conv_layers = nn.ModuleList() self.norm_layers = nn.ModuleList() - self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.conv_layers.append(Conv1dModel(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) self.norm_layers.append(LayerNorm(hidden_channels)) self.relu_drop = nn.Sequential( nn.ReLU(), nn.Dropout(p_dropout)) for _ in range(n_layers-1): - self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.conv_layers.append(Conv1dModel(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) self.norm_layers.append(LayerNorm(hidden_channels)) self.proj = nn.Conv1d(hidden_channels, out_channels, 1) self.proj.weight.data.zero_() @@ -124,14 +124,14 @@ class WN(torch.nn.Module): if gin_channels != 0: cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + self.cond_layer = weight_norm_modules(cond_layer, name='weight') for i in range(n_layers): dilation = dilation_rate ** i padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + in_layer = Conv1dModel(hidden_channels, 2*hidden_channels, kernel_size, dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + in_layer = weight_norm_modules(in_layer, name='weight') self.in_layers.append(in_layer) # last one is not necessary @@ -141,7 +141,7 @@ class WN(torch.nn.Module): res_skip_channels = hidden_channels res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + res_skip_layer = weight_norm_modules(res_skip_layer, name='weight') self.res_skip_layers.append(res_skip_layer) def forward(self, x, x_mask, g=None, **kwargs): @@ -176,32 +176,32 @@ class WN(torch.nn.Module): def remove_weight_norm(self): if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) + remove_weight_norm_modules(self.cond_layer) for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + remove_weight_norm_modules(l) class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): super(ResBlock1, self).__init__() self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))) ]) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) @@ -223,18 +223,18 @@ class ResBlock1(torch.nn.Module): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.convs2: - remove_weight_norm(l) + remove_weight_norm_modules(l) class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): super(ResBlock2, self).__init__() self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) @@ -252,7 +252,7 @@ class ResBlock2(torch.nn.Module): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) + remove_weight_norm_modules(l) class Log(nn.Module): diff --git a/train.py b/train.py index dba77bb..6d901d3 100644 --- a/train.py +++ b/train.py @@ -209,7 +209,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_lf0 = F.mse_loss(pred_lf0, lf0) + loss_lf0 = F.mse_loss(pred_lf0, lf0) if net_g.module.use_automatic_f0_prediction else 0 loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0 optim_g.zero_grad() scaler.scale(loss_gen_all).backward() @@ -241,13 +241,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade image_dict = { "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), - pred_lf0[0, 0, :].detach().cpu().numpy()), - "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), - norm_lf0[0, 0, :].detach().cpu().numpy()) + "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()) } + if net_g.module.use_automatic_f0_prediction: + image_dict.module.update({ + "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), + pred_lf0[0, 0, :].detach().cpu().numpy()), + "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), + norm_lf0[0, 0, :].detach().cpu().numpy()) + }) + utils.summarize( writer=writer, global_step=global_step, @@ -328,4 +332,4 @@ def evaluate(hps, generator, eval_loader, writer_eval): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py index 2c868f3..e727c70 100644 --- a/vdecoder/hifigan/models.py +++ b/vdecoder/hifigan/models.py @@ -6,11 +6,23 @@ import torch import torch.nn.functional as F import torch.nn as nn from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from torch.nn.utils import weight_norm,spectral_norm from .utils import init_weights, get_padding +from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D, Depthwise_Separable_TransposeConv1D + LRELU_SLOPE = 0.1 +Conv1dModel = nn.Conv1d +ConvTranspose1dModel = nn.ConvTranspose1d + +def set_Conv1dModel(use_depthwise_conv): + global Conv1dModel + Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d + +def set_ConvTranspose1dModel(use_depthwise_transposeconv): + global ConvTranspose1dModel + ConvTranspose1dModel = Depthwise_Separable_TransposeConv1D if use_depthwise_transposeconv else nn.ConvTranspose1d def load_model(model_path, device='cuda'): config_file = os.path.join(os.path.split(model_path)[0], 'config.json') @@ -36,21 +48,21 @@ class ResBlock1(torch.nn.Module): super(ResBlock1, self).__init__() self.h = h self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))) ]) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) @@ -66,9 +78,9 @@ class ResBlock1(torch.nn.Module): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.convs2: - remove_weight_norm(l) + remove_weight_norm_modules(l) class ResBlock2(torch.nn.Module): @@ -76,9 +88,9 @@ class ResBlock2(torch.nn.Module): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) @@ -92,7 +104,7 @@ class ResBlock2(torch.nn.Module): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) + remove_weight_norm_modules(l) def padDiff(x): @@ -277,7 +289,10 @@ class Generator(torch.nn.Module): def __init__(self, h): super(Generator, self).__init__() self.h = h - + + set_Conv1dModel(h["use_depthwise_conv"]) + set_ConvTranspose1dModel(h["use_depthwise_transposeconv"]) + self.num_kernels = len(h["resblock_kernel_sizes"]) self.num_upsamples = len(h["upsample_rates"]) self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) @@ -285,17 +300,17 @@ class Generator(torch.nn.Module): sampling_rate=h["sampling_rate"], harmonic_num=8) self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + self.conv_pre = weight_norm_modules(Conv1dModel(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) - self.ups.append(weight_norm( - ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), - k, u, padding=(k - u +1 ) // 2))) + self.ups.append(weight_norm_modules( + ConvTranspose1dModel(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u + 1 ) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) - self.noise_convs.append(Conv1d( + self.noise_convs.append(Conv1dModel( 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) @@ -305,7 +320,7 @@ class Generator(torch.nn.Module): for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): self.resblocks.append(resblock(h, ch, k, d)) - self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) @@ -342,11 +357,11 @@ class Generator(torch.nn.Module): def remove_weight_norm(self): print('Removing weight norm...') for l in self.ups: - remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.resblocks: l.remove_weight_norm() - remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + remove_weight_norm_modules(self.conv_pre) + remove_weight_norm_modules(self.conv_post) class DiscriminatorP(torch.nn.Module): diff --git a/vdecoder/hifigan/utils.py b/vdecoder/hifigan/utils.py index 9c93c99..4a4742f 100644 --- a/vdecoder/hifigan/utils.py +++ b/vdecoder/hifigan/utils.py @@ -21,7 +21,10 @@ def plot_spectrogram(spectrogram): def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if classname.find("Conv") != -1: + if "Depthwise_Separable" in classname: + m.depth_conv.weight.data.normal_(mean, std) + m.point_conv.weight.data.normal_(mean, std) + elif classname.find("Conv") != -1: m.weight.data.normal_(mean, std) diff --git a/vdecoder/hifiganwithsnake/models.py b/vdecoder/hifiganwithsnake/models.py index 4d9ae7a..709aead 100644 --- a/vdecoder/hifiganwithsnake/models.py +++ b/vdecoder/hifiganwithsnake/models.py @@ -6,12 +6,23 @@ import torch import torch.nn.functional as F import torch.nn as nn from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from torch.nn.utils import weight_norm, spectral_norm from .utils import init_weights, get_padding from vdecoder.hifiganwithsnake.alias.act import SnakeAlias +from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D, Depthwise_Separable_TransposeConv1D LRELU_SLOPE = 0.1 +Conv1dModel = nn.Conv1d +ConvTranspose1dModel = nn.ConvTranspose1d + +def set_Conv1dModel(use_depthwise_conv): + global Conv1dModel + Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d + +def set_ConvTranspose1dModel(use_depthwise_transposeconv): + global ConvTranspose1dModel + ConvTranspose1dModel = Depthwise_Separable_TransposeConv1D if use_depthwise_transposeconv else nn.ConvTranspose1d def load_model(model_path, device='cuda'): config_file = os.path.join(os.path.split(model_path)[0], 'config.json') @@ -33,79 +44,77 @@ def load_model(model_path, device='cuda'): class ResBlock1(torch.nn.Module): - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), C=None): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): super(ResBlock1, self).__init__() self.h = h self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))) ]) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) self.num_layers = len(self.convs1) + len(self.convs2) self.activations = nn.ModuleList([ - SnakeAlias(channels, C=C) for _ in range(self.num_layers) + SnakeAlias(channels) for _ in range(self.num_layers) ]) - def forward(self, x, DIM=None): + def forward(self, x): acts1, acts2 = self.activations[::2], self.activations[1::2] for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): - xt = a1(x, DIM) + xt = a1(x) xt = c1(xt) - xt = a2(xt, DIM) + xt = a2(xt) xt = c2(xt) x = xt + x return x def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.convs2: - remove_weight_norm(l) - + remove_weight_norm_modules(l) class ResBlock2(torch.nn.Module): - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), C=None): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) self.num_layers = len(self.convs) self.activations = nn.ModuleList([ - SnakeAlias(channels, C=C) for _ in range(self.num_layers) + SnakeAlias(channels) for _ in range(self.num_layers) ]) - def forward(self, x, DIM=None): + def forward(self, x): for c,a in zip(self.convs, self.activations): - xt = a(x, DIM) + xt = a(x) xt = c(xt) x = xt + x return x def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) - + remove_weight_norm_modules(l) def padDiff(x): return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0) @@ -289,7 +298,10 @@ class Generator(torch.nn.Module): def __init__(self, h): super(Generator, self).__init__() self.h = h - + + set_Conv1dModel(h["use_depthwise_conv"]) + set_ConvTranspose1dModel(h["use_depthwise_transposeconv"]) + self.num_kernels = len(h["resblock_kernel_sizes"]) self.num_upsamples = len(h["upsample_rates"]) self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) @@ -297,32 +309,29 @@ class Generator(torch.nn.Module): sampling_rate=h["sampling_rate"], harmonic_num=8) self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + self.conv_pre = weight_norm_modules(Conv1dModel(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) - self.ups.append(weight_norm( - ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), - k, u, padding=(k - u + 1) // 2))) + self.ups.append(weight_norm_modules( + ConvTranspose1dModel(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u +1 ) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) - self.noise_convs.append(Conv1d( - 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+ 1) // 2)) + self.noise_convs.append(Conv1dModel( + 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() - self.snakes = nn.ModuleList() for i in range(len(self.ups)): ch = h["upsample_initial_channel"] // (2 ** (i + 1)) - self.snakes.append(SnakeAlias(h["upsample_initial_channel"] // (2 ** (i)), C = h["upsample_initial_channel"] >> i)) for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): - self.resblocks.append(resblock(h, ch, k, d, C = h["upsample_initial_channel"] >> (i + 1))) + self.resblocks.append(resblock(h, ch, k, d)) - self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) - self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups)) self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) def forward(self, x, f0, g=None): @@ -335,9 +344,8 @@ class Generator(torch.nn.Module): x = x + self.cond(g) # print(124,x.shape,har_source.shape) for i in range(self.num_upsamples): - # print(f"self.snakes.{i}.pre:", x.shape) x = self.snakes[i](x) - # print(f"self.snakes.{i}.after:", x.shape) + # print(3,x.shape) x = self.ups[i](x) x_source = self.noise_convs[i](har_source) # print(4,x_source.shape,har_source.shape,x.shape) @@ -348,7 +356,6 @@ class Generator(torch.nn.Module): xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) - # print(f"self.resblocks.{i}.after:", xs.shape) x = xs / self.num_kernels x = self.snake_post(x) x = self.conv_post(x) @@ -359,11 +366,11 @@ class Generator(torch.nn.Module): def remove_weight_norm(self): print('Removing weight norm...') for l in self.ups: - remove_weight_norm(l) + remove_weight_norm_modules(l) for l in self.resblocks: - l.remove_weight_norm() - remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + l.remove_weight_norm_modules() + remove_weight_norm_modules(self.conv_pre) + remove_weight_norm_modules(self.conv_post) class DiscriminatorP(torch.nn.Module): diff --git a/vdecoder/hifiganwithsnake/utils.py b/vdecoder/hifiganwithsnake/utils.py index 9c93c99..4a4742f 100644 --- a/vdecoder/hifiganwithsnake/utils.py +++ b/vdecoder/hifiganwithsnake/utils.py @@ -21,7 +21,10 @@ def plot_spectrogram(spectrogram): def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if classname.find("Conv") != -1: + if "Depthwise_Separable" in classname: + m.depth_conv.weight.data.normal_(mean, std) + m.point_conv.weight.data.normal_(mean, std) + elif classname.find("Conv") != -1: m.weight.data.normal_(mean, std) From 89fb7159044b660292aae32979b059f321b7d487 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Mon, 26 Jun 2023 01:05:59 +0800 Subject: [PATCH 5/9] Debug --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 6d901d3..9ddd5c4 100644 --- a/train.py +++ b/train.py @@ -245,7 +245,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade } if net_g.module.use_automatic_f0_prediction: - image_dict.module.update({ + image_dict.update({ "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), pred_lf0[0, 0, :].detach().cpu().numpy()), "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(), From fd03762d522430192b09ff2da3419a5f92153248 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Mon, 26 Jun 2023 04:02:17 +0800 Subject: [PATCH 6/9] Updata BF16 AMP --- configs_template/config_template.json | 1 + modules/mel_processing.py | 7 +++++-- train.py | 17 ++++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/configs_template/config_template.json b/configs_template/config_template.json index e70b5d8..c7ec01b 100644 --- a/configs_template/config_template.json +++ b/configs_template/config_template.json @@ -12,6 +12,7 @@ "eps": 1e-09, "batch_size": 6, "fp16_run": false, + "half_type": "fp16", "lr_decay": 0.999875, "segment_size": 10240, "init_lr_ratio": 1, diff --git a/modules/mel_processing.py b/modules/mel_processing.py index a9936a2..a0ba17b 100644 --- a/modules/mel_processing.py +++ b/modules/mel_processing.py @@ -62,10 +62,13 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') y = y.squeeze(1) - + + y_dtype = y.dtype + if y.dtype == torch.bfloat16: y = y.to(torch.float32) spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) - spec = torch.view_as_real(spec) + spec = torch.view_as_real(spec).to(y_dtype) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) return spec diff --git a/train.py b/train.py index 9ddd5c4..cbc55d2 100644 --- a/train.py +++ b/train.py @@ -61,7 +61,7 @@ def run(rank, n_gpus, hps): utils.check_git_hash(hps.model_dir) writer = SummaryWriter(log_dir=hps.model_dir) writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) - + # for pytorch on win, backend use gloo dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank) torch.manual_seed(hps.train.seed) @@ -148,6 +148,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade train_loader, eval_loader = loaders if writers is not None: writer, writer_eval = writers + + half_type = torch.float16 if hps.train.half_type=="fp16" else torch.bfloat16 # train_loader.batch_sampler.set_epoch(epoch) global global_step @@ -169,8 +171,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax) - - with autocast(enabled=hps.train.fp16_run): + + with autocast(enabled=hps.train.fp16_run, dtype=half_type): y_hat, ids_slice, z_mask, \ (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths, spec_lengths=lengths,vol = volume) @@ -191,20 +193,21 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade # Discriminator y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): + with autocast(enabled=False, dtype=half_type): loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) loss_disc_all = loss_disc - + optim_d.zero_grad() scaler.scale(loss_disc_all).backward() scaler.unscale_(optim_d) grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) scaler.step(optim_d) + - with autocast(enabled=hps.train.fp16_run): + with autocast(enabled=hps.train.fp16_run, dtype=half_type): # Generator y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - with autocast(enabled=False): + with autocast(enabled=False, dtype=half_type): loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) From 531b765bfa0ffd56de453eee85d5f75089e07a1c Mon Sep 17 00:00:00 2001 From: YuriHead Date: Mon, 26 Jun 2023 04:12:57 +0800 Subject: [PATCH 7/9] Debug Snake --- vdecoder/hifiganwithsnake/models.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vdecoder/hifiganwithsnake/models.py b/vdecoder/hifiganwithsnake/models.py index 709aead..ff4b32d 100644 --- a/vdecoder/hifiganwithsnake/models.py +++ b/vdecoder/hifiganwithsnake/models.py @@ -44,7 +44,7 @@ def load_model(model_path, device='cuda'): class ResBlock1(torch.nn.Module): - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), C=None): super(ResBlock1, self).__init__() self.h = h self.convs1 = nn.ModuleList([ @@ -69,15 +69,15 @@ class ResBlock1(torch.nn.Module): self.num_layers = len(self.convs1) + len(self.convs2) self.activations = nn.ModuleList([ - SnakeAlias(channels) for _ in range(self.num_layers) + SnakeAlias(channels, C=C) for _ in range(self.num_layers) ]) - def forward(self, x): + def forward(self, x, DIM=None): acts1, acts2 = self.activations[::2], self.activations[1::2] for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): - xt = a1(x) + xt = a1(x, DIM) xt = c1(xt) - xt = a2(xt) + xt = a2(xt, DIM) xt = c2(xt) x = xt + x return x @@ -89,7 +89,7 @@ class ResBlock1(torch.nn.Module): remove_weight_norm_modules(l) class ResBlock2(torch.nn.Module): - def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), C=None): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ @@ -102,12 +102,12 @@ class ResBlock2(torch.nn.Module): self.num_layers = len(self.convs) self.activations = nn.ModuleList([ - SnakeAlias(channels) for _ in range(self.num_layers) + SnakeAlias(channels, C=C) for _ in range(self.num_layers) ]) - def forward(self, x): + def forward(self, x, DIM=None): for c,a in zip(self.convs, self.activations): - xt = a(x) + xt = a(x, DIM) xt = c(xt) x = xt + x return x @@ -324,14 +324,17 @@ class Generator(torch.nn.Module): else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() + self.snakes = nn.ModuleList() for i in range(len(self.ups)): ch = h["upsample_initial_channel"] // (2 ** (i + 1)) + self.snakes.append(SnakeAlias(h["upsample_initial_channel"] // (2 ** (i)), C = h["upsample_initial_channel"] >> i)) for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): self.resblocks.append(resblock(h, ch, k, d)) self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) + self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups)) self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) def forward(self, x, f0, g=None): @@ -344,8 +347,9 @@ class Generator(torch.nn.Module): x = x + self.cond(g) # print(124,x.shape,har_source.shape) for i in range(self.num_upsamples): + # print(f"self.snakes.{i}.pre:", x.shape) x = self.snakes[i](x) - # print(3,x.shape) + # print(f"self.snakes.{i}.after:", x.shape) x = self.ups[i](x) x_source = self.noise_convs[i](har_source) # print(4,x_source.shape,har_source.shape,x.shape) @@ -356,6 +360,7 @@ class Generator(torch.nn.Module): xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) + # print(f"self.resblocks.{i}.after:", xs.shape) x = xs / self.num_kernels x = self.snake_post(x) x = self.conv_post(x) From 98ce91c395787408bbfb5aa4db97d09e915edd35 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Mon, 26 Jun 2023 04:16:55 +0800 Subject: [PATCH 8/9] Debug --- vdecoder/hifiganwithsnake/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vdecoder/hifiganwithsnake/models.py b/vdecoder/hifiganwithsnake/models.py index ff4b32d..ba8d8f1 100644 --- a/vdecoder/hifiganwithsnake/models.py +++ b/vdecoder/hifiganwithsnake/models.py @@ -316,7 +316,7 @@ class Generator(torch.nn.Module): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) self.ups.append(weight_norm_modules( ConvTranspose1dModel(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), - k, u, padding=(k - u +1 ) // 2))) + k, u, padding=(k - u + 1 ) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) self.noise_convs.append(Conv1dModel( @@ -329,7 +329,7 @@ class Generator(torch.nn.Module): ch = h["upsample_initial_channel"] // (2 ** (i + 1)) self.snakes.append(SnakeAlias(h["upsample_initial_channel"] // (2 ** (i)), C = h["upsample_initial_channel"] >> i)) for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): - self.resblocks.append(resblock(h, ch, k, d)) + self.resblocks.append(resblock(h, ch, k, d, C = h["upsample_initial_channel"] >> (i + 1))) self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) From 57c079fbfab28e8922fe8db2b7d15edfd787e01c Mon Sep 17 00:00:00 2001 From: YuriHead Date: Fri, 30 Jun 2023 00:38:09 +0800 Subject: [PATCH 9/9] New Tiny --- configs_template/config_template.json | 2 +- models.py | 7 +-- modules/modules.py | 41 ------------- vdecoder/hifigan/models.py | 75 ++++++++++-------------- vdecoder/hifigan/utils.py | 11 ++-- vdecoder/hifiganwithsnake/models.py | 83 ++++++++++++--------------- vdecoder/hifiganwithsnake/utils.py | 11 ++-- 7 files changed, 78 insertions(+), 152 deletions(-) diff --git a/configs_template/config_template.json b/configs_template/config_template.json index c7ec01b..377a5ec 100644 --- a/configs_template/config_template.json +++ b/configs_template/config_template.json @@ -54,6 +54,7 @@ "upsample_initial_channel": 512, "upsample_kernel_sizes": [16,16, 4, 4, 4], "n_layers_q": 3, + "n_flow_layer": 4, "use_spectral_norm": false, "gin_channels": 768, "ssl_dim": 768, @@ -63,7 +64,6 @@ "speaker_embedding":false, "vol_embedding":false, "use_depthwise_conv":false, - "use_depthwise_transposeconv":false, "use_automatic_f0_prediction": true }, "spk": { diff --git a/models.py b/models.py index 2b2d5a1..a529206 100644 --- a/models.py +++ b/models.py @@ -322,8 +322,8 @@ class SynthesizerTrn(nn.Module): vol_embedding=False, vocoder_name = "nsf-hifigan", use_depthwise_conv = False, - use_depthwise_transposeconv = False, use_automatic_f0_prediction = True, + n_flow_layer = 4, **kwargs): super().__init__() @@ -372,8 +372,7 @@ class SynthesizerTrn(nn.Module): "upsample_initial_channel": upsample_initial_channel, "upsample_kernel_sizes": upsample_kernel_sizes, "gin_channels": gin_channels, - "use_depthwise_conv":use_depthwise_conv, - "use_depthwise_transposeconv":use_depthwise_transposeconv + "use_depthwise_conv":use_depthwise_conv } modules.set_Conv1dModel(self.use_depthwise_conv) @@ -390,7 +389,7 @@ class SynthesizerTrn(nn.Module): self.dec = Generator(h=hps) self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels) if self.use_automatic_f0_prediction: self.f0_decoder = F0Decoder( 1, diff --git a/modules/modules.py b/modules/modules.py index c1326a3..df63e29 100644 --- a/modules/modules.py +++ b/modules/modules.py @@ -66,47 +66,6 @@ class ConvReluNorm(nn.Module): return x * x_mask -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size ** i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, - groups=channels, dilation=dilation, padding=padding - )) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): super(WN, self).__init__() diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py index e727c70..10eca45 100644 --- a/vdecoder/hifigan/models.py +++ b/vdecoder/hifigan/models.py @@ -1,28 +1,18 @@ -import os import json -from .env import AttrDict +import os + import numpy as np import torch -import torch.nn.functional as F import torch.nn as nn -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm,spectral_norm -from .utils import init_weights, get_padding +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm -from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D, Depthwise_Separable_TransposeConv1D +from .env import AttrDict +from .utils import get_padding, init_weights LRELU_SLOPE = 0.1 -Conv1dModel = nn.Conv1d -ConvTranspose1dModel = nn.ConvTranspose1d - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - -def set_ConvTranspose1dModel(use_depthwise_transposeconv): - global ConvTranspose1dModel - ConvTranspose1dModel = Depthwise_Separable_TransposeConv1D if use_depthwise_transposeconv else nn.ConvTranspose1d def load_model(model_path, device='cuda'): config_file = os.path.join(os.path.split(model_path)[0], 'config.json') @@ -48,21 +38,21 @@ class ResBlock1(torch.nn.Module): super(ResBlock1, self).__init__() self.h = h self.convs1 = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[2], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))) ]) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) @@ -78,9 +68,9 @@ class ResBlock1(torch.nn.Module): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm_modules(l) + remove_weight_norm(l) for l in self.convs2: - remove_weight_norm_modules(l) + remove_weight_norm(l) class ResBlock2(torch.nn.Module): @@ -88,9 +78,9 @@ class ResBlock2(torch.nn.Module): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) @@ -104,7 +94,7 @@ class ResBlock2(torch.nn.Module): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm_modules(l) + remove_weight_norm(l) def padDiff(x): @@ -211,8 +201,6 @@ class SineGen(torch.nn.Module): output uv: tensor(batchsize=1, length, 1) """ with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, - device=f0.device) # fundamental component fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) @@ -289,10 +277,7 @@ class Generator(torch.nn.Module): def __init__(self, h): super(Generator, self).__init__() self.h = h - - set_Conv1dModel(h["use_depthwise_conv"]) - set_ConvTranspose1dModel(h["use_depthwise_transposeconv"]) - + self.num_kernels = len(h["resblock_kernel_sizes"]) self.num_upsamples = len(h["upsample_rates"]) self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) @@ -300,17 +285,17 @@ class Generator(torch.nn.Module): sampling_rate=h["sampling_rate"], harmonic_num=8) self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm_modules(Conv1dModel(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) - self.ups.append(weight_norm_modules( - ConvTranspose1dModel(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), - k, u, padding=(k - u + 1 ) // 2))) + self.ups.append(weight_norm( + ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u +1 ) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) - self.noise_convs.append(Conv1dModel( + self.noise_convs.append(Conv1d( 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) @@ -320,7 +305,7 @@ class Generator(torch.nn.Module): for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): self.resblocks.append(resblock(h, ch, k, d)) - self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) @@ -357,18 +342,18 @@ class Generator(torch.nn.Module): def remove_weight_norm(self): print('Removing weight norm...') for l in self.ups: - remove_weight_norm_modules(l) + remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() - remove_weight_norm_modules(self.conv_pre) - remove_weight_norm_modules(self.conv_post) + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), @@ -427,7 +412,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv1d(1, 128, 15, 1, padding=7)), norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), diff --git a/vdecoder/hifigan/utils.py b/vdecoder/hifigan/utils.py index 4a4742f..e519e2b 100644 --- a/vdecoder/hifigan/utils.py +++ b/vdecoder/hifigan/utils.py @@ -1,10 +1,10 @@ import glob import os -import matplotlib -import torch -from torch.nn.utils import weight_norm + # matplotlib.use("Agg") import matplotlib.pylab as plt +import torch +from torch.nn.utils import weight_norm def plot_spectrogram(spectrogram): @@ -21,10 +21,7 @@ def plot_spectrogram(spectrogram): def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if "Depthwise_Separable" in classname: - m.depth_conv.weight.data.normal_(mean, std) - m.point_conv.weight.data.normal_(mean, std) - elif classname.find("Conv") != -1: + if classname.find("Conv") != -1: m.weight.data.normal_(mean, std) diff --git a/vdecoder/hifiganwithsnake/models.py b/vdecoder/hifiganwithsnake/models.py index ba8d8f1..ab9bcd1 100644 --- a/vdecoder/hifiganwithsnake/models.py +++ b/vdecoder/hifiganwithsnake/models.py @@ -1,28 +1,20 @@ -import os import json -from .env import AttrDict +import os + import numpy as np import torch -import torch.nn.functional as F import torch.nn as nn -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, spectral_norm -from .utils import init_weights, get_padding +import torch.nn.functional as F +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + from vdecoder.hifiganwithsnake.alias.act import SnakeAlias -from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D, Depthwise_Separable_TransposeConv1D + +from .env import AttrDict +from .utils import get_padding, init_weights LRELU_SLOPE = 0.1 -Conv1dModel = nn.Conv1d -ConvTranspose1dModel = nn.ConvTranspose1d - -def set_Conv1dModel(use_depthwise_conv): - global Conv1dModel - Conv1dModel = Depthwise_Separable_Conv1D if use_depthwise_conv else nn.Conv1d - -def set_ConvTranspose1dModel(use_depthwise_transposeconv): - global ConvTranspose1dModel - ConvTranspose1dModel = Depthwise_Separable_TransposeConv1D if use_depthwise_transposeconv else nn.ConvTranspose1d def load_model(model_path, device='cuda'): config_file = os.path.join(os.path.split(model_path)[0], 'config.json') @@ -48,21 +40,21 @@ class ResBlock1(torch.nn.Module): super(ResBlock1, self).__init__() self.h = h self.convs1 = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[2], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2]))) ]) self.convs1.apply(init_weights) self.convs2 = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=1, + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))) ]) self.convs2.apply(init_weights) @@ -84,18 +76,19 @@ class ResBlock1(torch.nn.Module): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm_modules(l) + remove_weight_norm(l) for l in self.convs2: - remove_weight_norm_modules(l) + remove_weight_norm(l) + class ResBlock2(torch.nn.Module): def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), C=None): super(ResBlock2, self).__init__() self.h = h self.convs = nn.ModuleList([ - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[0], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), - weight_norm_modules(Conv1dModel(channels, channels, kernel_size, 1, dilation=dilation[1], + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))) ]) self.convs.apply(init_weights) @@ -114,7 +107,8 @@ class ResBlock2(torch.nn.Module): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm_modules(l) + remove_weight_norm(l) + def padDiff(x): return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0) @@ -220,8 +214,6 @@ class SineGen(torch.nn.Module): output uv: tensor(batchsize=1, length, 1) """ with torch.no_grad(): - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, - device=f0.device) # fundamental component fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) @@ -298,10 +290,7 @@ class Generator(torch.nn.Module): def __init__(self, h): super(Generator, self).__init__() self.h = h - - set_Conv1dModel(h["use_depthwise_conv"]) - set_ConvTranspose1dModel(h["use_depthwise_transposeconv"]) - + self.num_kernels = len(h["resblock_kernel_sizes"]) self.num_upsamples = len(h["upsample_rates"]) self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) @@ -309,18 +298,18 @@ class Generator(torch.nn.Module): sampling_rate=h["sampling_rate"], harmonic_num=8) self.noise_convs = nn.ModuleList() - self.conv_pre = weight_norm_modules(Conv1dModel(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) - self.ups.append(weight_norm_modules( - ConvTranspose1dModel(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), - k, u, padding=(k - u + 1 ) // 2))) + self.ups.append(weight_norm( + ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u + 1) // 2))) if i + 1 < len(h["upsample_rates"]): # stride_f0 = np.prod(h["upsample_rates"][i + 1:]) - self.noise_convs.append(Conv1dModel( - 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2)) + self.noise_convs.append(Conv1d( + 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+ 1) // 2)) else: self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) self.resblocks = nn.ModuleList() @@ -331,7 +320,7 @@ class Generator(torch.nn.Module): for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): self.resblocks.append(resblock(h, ch, k, d, C = h["upsample_initial_channel"] >> (i + 1))) - self.conv_post = weight_norm_modules(Conv1dModel(ch, 1, 7, 1, padding=3)) + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) self.ups.apply(init_weights) self.conv_post.apply(init_weights) self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups)) @@ -371,18 +360,18 @@ class Generator(torch.nn.Module): def remove_weight_norm(self): print('Removing weight norm...') for l in self.ups: - remove_weight_norm_modules(l) + remove_weight_norm(l) for l in self.resblocks: - l.remove_weight_norm_modules() - remove_weight_norm_modules(self.conv_pre) - remove_weight_norm_modules(self.conv_post) + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), @@ -441,7 +430,7 @@ class MultiPeriodDiscriminator(torch.nn.Module): class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList([ norm_f(Conv1d(1, 128, 15, 1, padding=7)), norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), diff --git a/vdecoder/hifiganwithsnake/utils.py b/vdecoder/hifiganwithsnake/utils.py index 4a4742f..e519e2b 100644 --- a/vdecoder/hifiganwithsnake/utils.py +++ b/vdecoder/hifiganwithsnake/utils.py @@ -1,10 +1,10 @@ import glob import os -import matplotlib -import torch -from torch.nn.utils import weight_norm + # matplotlib.use("Agg") import matplotlib.pylab as plt +import torch +from torch.nn.utils import weight_norm def plot_spectrogram(spectrogram): @@ -21,10 +21,7 @@ def plot_spectrogram(spectrogram): def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ - if "Depthwise_Separable" in classname: - m.depth_conv.weight.data.normal_(mean, std) - m.point_conv.weight.data.normal_(mean, std) - elif classname.find("Conv") != -1: + if classname.find("Conv") != -1: m.weight.data.normal_(mean, std)