Merge branch 'NT-v1.0' of https://github.com/ylzz1997/so-vits-svc into NT-v1.0

2023-05-16 13:17:38 +08:00 · 2023-05-16 13:17:38 +08:00 · ddc594a8e1
parent 23c9025fac 1f52a0b3ec
commit ddc594a8e1
5 changed files with 485 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -147,6 +147,7 @@ results
 inference/chunks_temp.json
 logs
 hubert/checkpoint_best_legacy_500.pt
+pretrain/**/*.pt
 configs/config.json
 filelists/test.txt
 filelists/train.txt
--- a/onnx_export_speaker_mix.py
+++ b/onnx_export_speaker_mix.py
@ -0,0 +1,106 @@
+import torch
+from torchaudio.models.wav2vec2.utils import import_fairseq_model
+from fairseq import checkpoint_utils
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
+import utils
+
+def get_hubert_model():
+    """Load the fairseq checkpoint stored under ``hubert/`` and return it in eval mode.
+
+    Only the first model of the loaded ensemble is used; the saved config and
+    task that fairseq returns alongside it are discarded.
+    """
+    checkpoint = "hubert/checkpoint_best_legacy_500.pt"
+    print("load model(s) from {}".format(checkpoint))
+    ensemble, _cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
+        [checkpoint], suffix="")
+    hubert = ensemble[0]
+    hubert.eval()
+    return hubert
+
+
+def main(HubertExport, NetExport):
+    """Export the synthesizer of one checkpoint folder to ONNX.
+
+    Args:
+        HubertExport: Currently has no effect; the HuBERT export code is kept
+            below only as a disabled string literal.
+        NetExport: When truthy, load ``checkpoints/<path>/config.json`` and
+            ``checkpoints/<path>/model.pth`` and write
+            ``checkpoints/<path>/model.onnx``.
+    """
+    path = "yuuka"
+
+    # Disabled HuBERT export, preserved verbatim as a no-op string literal.
+    '''if HubertExport:
+        device = torch.device("cpu")
+        vec_path = "hubert/checkpoint_best_legacy_500.pt"
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        original = models[0]
+        original.eval()
+        model = original
+        test_input = torch.rand(1, 1, 16000)
+        model(test_input)
+        torch.onnx.export(model,
+                          test_input,
+                          "hubert4.0.onnx",
+                          export_params=True,
+                          opset_version=16,
+                          do_constant_folding=True,
+                          input_names=['source'],
+                          output_names=['embed'],
+                          dynamic_axes={
+                              'source':
+                                  {
+                                      2: "sample_length"
+                                  },
+                          }
+                          )'''
+    if NetExport:
+        device = torch.device("cpu")
+        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+        SVCVITS = SynthesizerTrn(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model)
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+        _ = SVCVITS.eval().to(device)
+        for param in SVCVITS.parameters():
+            param.requires_grad = False
+
+        # Dummy tracing inputs: batch of 1, 10 frames.
+        # The content features feed SynthesizerTrn.pre, a Conv1d over ssl_dim
+        # channels, so their width must be ssl_dim rather than gin_channels.
+        test_hidden_unit = torch.rand(1, 10, SVCVITS.ssl_dim)
+        test_pitch = torch.rand(1, 10)
+        test_mel2ph = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
+        test_uv = torch.ones(1, 10, dtype=torch.float32)
+        # The noise is scaled by the text encoder's log-scale output, which has
+        # inter_channels channels.
+        test_noise = torch.randn(1, SVCVITS.inter_channels, 10)
+
+        export_mix = False
+
+        test_sid = torch.LongTensor([0])
+        if export_mix:
+            # Uniform mixing weights, one per speaker listed in the config.
+            n_spk = len(hps.spk)
+            test_sid = torch.tensor([1.0 / float(n_spk)] * n_spk)
+            SVCVITS.export_chara_mix(n_spk)
+
+        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+        output_names = ["audio", ]
+
+        torch.onnx.export(SVCVITS,
+                          (
+                              test_hidden_unit.to(device),
+                              test_pitch.to(device),
+                              test_mel2ph.to(device),
+                              test_uv.to(device),
+                              test_noise.to(device),
+                              test_sid.to(device)
+                          ),
+                          f"checkpoints/{path}/model.onnx",
+                          dynamic_axes={
+                              "c": [0, 1],
+                              "f0": [1],
+                              "mel2ph": [1],
+                              "uv": [1],
+                              "noise": [2],
+                          },
+                          do_constant_folding=False,
+                          opset_version=16,
+                          verbose=False,
+                          input_names=input_names,
+                          output_names=output_names)
+
+
+# Script entry point: HubertExport=False, NetExport=True (synthesizer export only).
+if __name__ == '__main__':
+    main(False, True)
--- a/onnxexport/model_onnx_speaker_mix.py
+++ b/onnxexport/model_onnx_speaker_mix.py
@ -0,0 +1,350 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+import modules.attentions as attentions
+import modules.commons as commons
+import modules.modules as modules
+
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+import utils
+from modules.commons import init_weights, get_padding
+from vdecoder.hifigan.models import Generator
+from utils import f0_to_coarse
+
+
+class ResidualCouplingBlock(nn.Module):
+    """Chain of ``n_flows`` mean-only residual coupling layers, each followed by a ``Flip``."""
+
+    def __init__(self,
+                 channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 n_flows=4,
+                 gin_channels=0):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        # Layers are stored interleaved: coupling, flip, coupling, flip, ...
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            coupling = modules.ResidualCouplingLayer(
+                channels, hidden_channels, kernel_size, dilation_rate, n_layers,
+                gin_channels=gin_channels, mean_only=True)
+            self.flows.extend([coupling, modules.Flip()])
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Apply every flow in order, or in reversed order when ``reverse`` is truthy.
+
+        In the forward direction each flow returns a pair whose second element
+        is discarded; in the reverse direction each flow returns the tensor
+        directly.
+        """
+        if reverse:
+            for layer in reversed(self.flows):
+                x = layer(x, x_mask, g=g, reverse=reverse)
+            return x
+
+        for layer in self.flows:
+            x, _ = layer(x, x_mask, g=g, reverse=reverse)
+        return x
+
+
+class Encoder(nn.Module):
+    """1x1 conv -> WN stack -> 1x1 projection to a mean / log-scale pair, then sampling."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 dilation_rate,
+                 n_layers,
+                 gin_channels=0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        # Twice out_channels: one half for the mean, one half for the log-scale.
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, x, x_lengths, g=None):
+        """Encode ``x`` and draw a sample ``z = m + eps * exp(logs)``.
+
+        Returns:
+            ``(z, m, logs, x_mask)`` where ``x_mask`` is built from
+            ``x_lengths`` and cast to the dtype of ``x``.
+        """
+        mask = commons.sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
+        hidden = self.enc(self.pre(x) * mask, mask, g=g)
+        m, logs = torch.split(self.proj(hidden) * mask, self.out_channels, dim=1)
+        eps = torch.randn_like(m)
+        z = (m + eps * torch.exp(logs)) * mask
+        return z, m, logs, mask
+
+
+class TextEncoder(nn.Module):
+    """Adds a coarse-pitch embedding to the input and encodes it with an attention encoder."""
+
+    def __init__(self,
+                 out_channels,
+                 hidden_channels,
+                 kernel_size,
+                 n_layers,
+                 gin_channels=0,
+                 filter_channels=None,
+                 n_heads=None,
+                 p_dropout=None):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        # Twice out_channels: one half for the mean, one half for the log-scale.
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+        # One embedding row per coarse pitch value (256 rows).
+        self.f0_emb = nn.Embedding(256, hidden_channels)
+
+        self.enc_ = attentions.Encoder(
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout)
+
+    def forward(self, x, x_mask, f0=None, z=None):
+        """Encode ``x`` conditioned on pitch and combine the result with external noise.
+
+        Args:
+            x: Input features; the pitch embedding is added after swapping its
+                last two axes.
+            x_mask: Mask multiplied into the features and the statistics.
+            f0: Integer indices into the pitch embedding table.
+            z: Noise tensor supplied by the caller; scaled by ``exp(logs)``.
+
+        Returns:
+            ``(z, m, logs, x_mask)`` with ``z = (m + z * exp(logs)) * x_mask``.
+        """
+        pitch_embedding = self.f0_emb(f0).transpose(1, 2)
+        hidden = self.enc_((x + pitch_embedding) * x_mask, x_mask)
+        m, logs = torch.split(self.proj(hidden) * x_mask, self.out_channels, dim=1)
+        z = (m + z * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds the time axis into rows of ``period`` samples and applies 2-D convs."""
+
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+
+        # (in_channels, out_channels, stride) for each conv, in order.
+        layer_specs = [
+            (1, 32, (stride, 1)),
+            (32, 128, (stride, 1)),
+            (128, 512, (stride, 1)),
+            (512, 1024, (stride, 1)),
+            (1024, 1024, 1),
+        ]
+        self.convs = nn.ModuleList([
+            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), step,
+                          padding=(get_padding(kernel_size, 1), 0)))
+            for c_in, c_out, step in layer_specs
+        ])
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Return the flattened score and the list of per-layer feature maps."""
+        fmap = []
+
+        # 1d to 2d: reflect-pad the time axis up to a multiple of the period,
+        # then view it as (t // period, period).
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t += n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+    """Discriminator built from a stack of (grouped) 1-D convolutions."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        if use_spectral_norm == False:
+            norm_f = weight_norm
+        else:
+            norm_f = spectral_norm
+
+        # (in_channels, out_channels, kernel_size, stride, groups, padding)
+        conv_specs = [
+            (1, 16, 15, 1, 1, 7),
+            (16, 64, 41, 4, 4, 20),
+            (64, 256, 41, 4, 16, 20),
+            (256, 1024, 41, 4, 64, 20),
+            (1024, 1024, 41, 4, 256, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        ]
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(c_in, c_out, k, s, groups=grp, padding=pad))
+            for c_in, c_out, k, s, grp, pad in conv_specs
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return the flattened score and the list of per-layer feature maps."""
+        fmap = []
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class F0Decoder(nn.Module):
+    """Predicts an ``out_channels``-wide pitch track from hidden features and a normalized f0 input."""
+
+    def __init__(self,
+                 out_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 spk_channels=0):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.spk_channels = spk_channels
+
+        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
+        self.decoder = attentions.FFT(
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout)
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+        # Lifts the single-channel normalized f0 to hidden_channels.
+        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
+        # Projects the speaker embedding to hidden_channels.
+        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
+
+    def forward(self, x, norm_f0, x_mask, spk_emb=None):
+        """Decode a pitch track.
+
+        Args:
+            x: Hidden features; detached, so no gradient flows back through them.
+            norm_f0: Normalized f0 with a single channel (input of ``f0_prenet``).
+            x_mask: Mask multiplied into the intermediate and final tensors.
+            spk_emb: Optional speaker embedding with ``spk_channels`` channels.
+
+        Returns:
+            The masked projection with ``out_channels`` channels.
+        """
+        x = torch.detach(x)
+        if spk_emb is not None:
+            x = x + self.cond(spk_emb)
+        # Out-of-place add: when spk_emb is None, ``x`` still shares storage
+        # with the caller's tensor (detach() does not copy), so ``+=`` would
+        # silently overwrite the caller's data.
+        x = x + self.f0_prenet(norm_f0)
+        x = self.prenet(x) * x_mask
+        x = self.decoder(x * x_mask, x_mask)
+        x = self.proj(x) * x_mask
+        return x
+
+
+class SynthesizerTrn(nn.Module):
+    """Synthesizer variant used for ONNX export, with optional speaker mixing.
+
+    By default ``forward`` looks the speaker up in ``emb_g``. After
+    ``export_chara_mix`` has been called it instead blends the precomputed
+    speaker embeddings using one weight per speaker.
+    """
+
+    def __init__(self,
+                 spec_channels,
+                 segment_size,
+                 inter_channels,
+                 hidden_channels,
+                 filter_channels,
+                 n_heads,
+                 n_layers,
+                 kernel_size,
+                 p_dropout,
+                 resblock,
+                 resblock_kernel_sizes,
+                 resblock_dilation_sizes,
+                 upsample_rates,
+                 upsample_initial_channel,
+                 upsample_kernel_sizes,
+                 gin_channels,
+                 ssl_dim,
+                 n_speakers,
+                 sampling_rate=44100,
+                 **kwargs):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.ssl_dim = ssl_dim
+        # Speaker id -> embedding of width gin_channels.
+        self.emb_g = nn.Embedding(n_speakers, gin_channels)
+
+        # Projects the ssl_dim-wide content features to hidden_channels.
+        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
+
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels=filter_channels,
+            n_heads=n_heads,
+            n_layers=n_layers,
+            kernel_size=kernel_size,
+            p_dropout=p_dropout
+        )
+        hps = {
+            "sampling_rate": sampling_rate,
+            "inter_channels": inter_channels,
+            "resblock": resblock,
+            "resblock_kernel_sizes": resblock_kernel_sizes,
+            "resblock_dilation_sizes": resblock_dilation_sizes,
+            "upsample_rates": upsample_rates,
+            "upsample_initial_channel": upsample_initial_channel,
+            "upsample_kernel_sizes": upsample_kernel_sizes,
+            "gin_channels": gin_channels,
+        }
+        self.dec = Generator(h=hps)
+        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+        self.f0_decoder = F0Decoder(
+            1,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            spk_channels=gin_channels
+        )
+        # Two rows: one per unvoiced/voiced flag value.
+        self.emb_uv = nn.Embedding(2, hidden_channels)
+        self.predict_f0 = False
+        # Filled in by export_chara_mix(); unused until export_mix is True.
+        self.speaker_map = []
+        self.export_mix = False
+
+    def export_chara_mix(self, n_speakers_mix):
+        """Precompute the embeddings of speakers ``0 .. n_speakers_mix - 1`` and enable mixing.
+
+        Stores them in ``self.speaker_map`` with shape
+        ``[n_speakers_mix, 1, gin_channels, 1]`` and sets ``export_mix``.
+        """
+        # Stack the detached embeddings directly instead of round-tripping
+        # every one of them through numpy.
+        self.speaker_map = torch.stack([
+            self.emb_g(torch.LongTensor([[i]])).transpose(1, 2).detach()
+            for i in range(n_speakers_mix)
+        ])
+        self.export_mix = True
+
+    def forward(self, c, f0, mel2ph, uv, noise=None, g=None, cluster_infer_ratio=0.1):
+        """Synthesize audio from content features, pitch and a speaker input.
+
+        Args:
+            c: Content features, ``[B, T, ssl_dim]``.
+            f0: Pitch values per frame.
+            mel2ph: Integer indices used to gather rows of ``c`` after one
+                padding frame has been prepended (index 0 selects the pad).
+            uv: Per-frame flag, cast to integer to index ``emb_uv``.
+            noise: Noise tensor handed to the text encoder as ``z``.
+            g: Speaker id tensor by default; in mixing mode, a 1-D tensor with
+                one weight per speaker in ``speaker_map``.
+            cluster_infer_ratio: Unused.
+
+        Returns:
+            The output of the generator ``self.dec``.
+        """
+
+        decoder_inp = F.pad(c, [0, 0, 1, 0])
+        mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]])
+        c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2)  # [B, T, H] -> [B, H, T]
+
+        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+
+        if self.export_mix:
+            # In mixing mode the weights arrive in ``g``: [S] -> [S, 1, 1, 1].
+            spk_mix = g.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+            # speaker_map is [S, 1, H, 1] (already transposed when it was
+            # built), so the weighted sum over S is [1, H, 1] -- the same
+            # layout the embedding branch below produces.
+            g = torch.sum(spk_mix * self.speaker_map, dim=0)
+        else:
+            g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
+
+        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+
+        if self.predict_f0:
+            # Replace the supplied pitch with the f0 decoder's prediction,
+            # working in a scaled mel-like log domain and converting back.
+            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
+            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
+            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
+
+        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise)
+        z = self.flow(z_p, c_mask, g=g, reverse=True)
+        o = self.dec(z * c_mask, g=g, f0=f0)
+        return o
--- a/pretrain/meta.py
+++ b/pretrain/meta.py
@ -0,0 +1,27 @@
+def download_dict():
+    """Return the download table of the supported speech encoders.
+
+    Each encoder name maps to a dict holding the checkpoint ``url`` and the
+    local ``output`` path. A fresh table is built on every call.
+    """
+    # "vec768l12" and "vec256l9" share the same checkpoint file.
+    legacy_url = "https://ibm.ent.box.com/shared/static/z1wgl1stco8ffooyatzdwsqn2psd9lrr"
+    legacy_path = "./pretrain/checkpoint_best_legacy_500.pt"
+
+    table = {}
+    table["vec768l12"] = {"url": legacy_url, "output": legacy_path}
+    table["vec256l9"] = {"url": legacy_url, "output": legacy_path}
+    table["hubertsoft"] = {
+        "url": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt",
+        "output": "./pretrain/hubert-soft-0d54a1f4.pt",
+    }
+    return table
+
+
+def get_speech_encoder(config_path="configs/config.json"):
+    """Look up the download URL and output path of the configured speech encoder.
+
+    Args:
+        config_path: Path of a JSON config containing ``model.speech_encoder``.
+
+    Returns:
+        A ``(url, output)`` tuple taken from ``download_dict()``.
+
+    Raises:
+        KeyError: If the config lacks ``model.speech_encoder`` or names an
+            encoder that is not in ``download_dict()``.
+    """
+    import json
+
+    # JSON is UTF-8 by specification; do not depend on the platform default.
+    with open(config_path, "r", encoding="utf-8") as f:
+        config = json.load(f)
+
+    speech_encoder = config["model"]["speech_encoder"]
+    # Named ``entry`` so the builtin ``dict`` is not shadowed.
+    entry = download_dict()[speech_encoder]
+    return entry["url"], entry["output"]
--- a/sovits4_for_colab.ipynb
+++ b/sovits4_for_colab.ipynb