Fix two bugs: the NsfHifigan upsampling error under DML, and SourceModuleHnNSF

白叶 藤原 2023-07-17 17:51:55 +08:00
parent 72deb15df3
commit 90c9ccc6a8
5 changed files with 274 additions and 218 deletions

View File

@@ -1,56 +1,138 @@
+import json
+
 import torch
 import utils
-from onnxexport.model_onnx import SynthesizerTrn
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
 
 
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
+def main():
+    path = "crs"
+    device = torch.device("cpu")
+    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+    SVCVITS = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+    _ = SVCVITS.eval().to(device)
+    for i in SVCVITS.parameters():
+        i.requires_grad = False
+    num_frames = 200
+
+    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
+    test_pitch = torch.rand(1, num_frames)
+    test_vol = torch.rand(1, num_frames)
+    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
+    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
+    test_noise = torch.randn(1, 192, num_frames)
+    test_sid = torch.LongTensor([0])
+    export_mix = True
+    if len(hps.spk) < 2:
+        export_mix = False
+
+    if export_mix:
+        spk_mix = []
+        n_spk = len(hps.spk)
+        for i in range(n_spk):
+            spk_mix.append(1.0/float(n_spk))
+        test_sid = torch.tensor(spk_mix)
+        SVCVITS.export_chara_mix(hps.spk)
+        test_sid = test_sid.unsqueeze(0)
+        test_sid = test_sid.repeat(num_frames, 1)
+
+    SVCVITS.eval()
+
+    if export_mix:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2],
+            "sid": [0]
+        }
+    else:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2]
+        }
+
+    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+    output_names = ["audio", ]
+
+    if SVCVITS.vol_embedding:
+        input_names.append("vol")
+        vol_dadict = {"vol": [1]}
+        daxes.update(vol_dadict)
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device)
+        )
+    else:
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device)
+        )
+
+    # SVCVITS = torch.jit.script(SVCVITS)
+    SVCVITS(test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device))
+
+    SVCVITS.dec.OnnxExport()
+
+    torch.onnx.export(
+        SVCVITS,
+        test_inputs,
+        f"checkpoints/{path}/{path}_SoVits.onnx",
+        dynamic_axes=daxes,
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names
+    )
+
+    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
+    spklist = []
+    for key in hps.spk.keys():
+        spklist.append(key)
+
+    MoeVSConf = {
+        "Folder": f"{path}",
+        "Name": f"{path}",
+        "Type": "SoVits",
+        "Rate": hps.data.sampling_rate,
+        "Hop": hps.data.hop_length,
+        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
+        "SoVits4": True,
+        "SoVits3": False,
+        "CharaMix": export_mix,
+        "Volume": SVCVITS.vol_embedding,
+        "HiddenSize": SVCVITS.gin_channels,
+        "Characters": spklist
+    }
+
+    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
+        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
 
 
 if __name__ == '__main__':
-    main(True)
+    main()
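For orientation (not part of the commit): the exported graph can be driven through onnxruntime's DirectML backend, which is the "DML" the commit title refers to. A minimal sketch, assuming a single-speaker model (export_mix stays False, no vol embedding) and 256-dim content features; input names and shapes mirror the dummy tensors above, and DmlExecutionProvider is provided by the onnxruntime-directml package:

import numpy as np
import onnxruntime as ort

# Session on DirectML, falling back to CPU if the DML provider is unavailable.
sess = ort.InferenceSession(
    "checkpoints/crs/crs_SoVits.onnx",
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)
n = 200  # the frame axis is dynamic, per daxes above
feed = {
    "c": np.random.rand(1, n, 256).astype(np.float32),       # content features
    "f0": (np.random.rand(1, n) * 400).astype(np.float32),   # frame-rate pitch in Hz
    "mel2ph": np.arange(n, dtype=np.int64)[None],
    "uv": np.ones((1, n), dtype=np.float32),
    "noise": np.random.randn(1, 192, n).astype(np.float32),
    "sid": np.array([0], dtype=np.int64),                    # single speaker id
}
audio = sess.run(["audio"], feed)[0]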

onnx_export_old.py (new file, 56 lines)
View File

@@ -0,0 +1,56 @@
import torch
import utils
from onnxexport.model_onnx import SynthesizerTrn
def main(NetExport):
path = "SoVits4.0"
if NetExport:
device = torch.device("cpu")
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
SVCVITS = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model)
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
_ = SVCVITS.eval().to(device)
for i in SVCVITS.parameters():
i.requires_grad = False
n_frame = 10
test_hidden_unit = torch.rand(1, n_frame, 256)
test_pitch = torch.rand(1, n_frame)
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
test_noise = torch.randn(1, 192, n_frame)
test_sid = torch.LongTensor([0])
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
output_names = ["audio", ]
torch.onnx.export(SVCVITS,
(
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device)
),
f"checkpoints/{path}/model.onnx",
dynamic_axes={
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2],
},
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names)
if __name__ == '__main__':
main(True)

View File

@@ -1,138 +0,0 @@
import json
import torch
import utils
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
def main():
path = "crs"
device = torch.device("cpu")
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
SVCVITS = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model)
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
_ = SVCVITS.eval().to(device)
for i in SVCVITS.parameters():
i.requires_grad = False
num_frames = 200
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
test_pitch = torch.rand(1, num_frames)
test_vol = torch.rand(1, num_frames)
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
test_noise = torch.randn(1, 192, num_frames)
test_sid = torch.LongTensor([0])
export_mix = True
if len(hps.spk) < 2:
export_mix = False
if export_mix:
spk_mix = []
n_spk = len(hps.spk)
for i in range(n_spk):
spk_mix.append(1.0/float(n_spk))
test_sid = torch.tensor(spk_mix)
SVCVITS.export_chara_mix(hps.spk)
test_sid = test_sid.unsqueeze(0)
test_sid = test_sid.repeat(num_frames, 1)
SVCVITS.eval()
if export_mix:
daxes = {
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2],
"sid":[0]
}
else:
daxes = {
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2]
}
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
output_names = ["audio", ]
if SVCVITS.vol_embedding:
input_names.append("vol")
vol_dadict = {"vol" : [1]}
daxes.update(vol_dadict)
test_inputs = (
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device),
test_vol.to(device)
)
else:
test_inputs = (
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device)
)
# SVCVITS = torch.jit.script(SVCVITS)
SVCVITS(test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device),
test_vol.to(device))
torch.onnx.export(
SVCVITS,
test_inputs,
f"checkpoints/{path}/{path}_SoVits.onnx",
dynamic_axes=daxes,
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names
)
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
spklist = []
for key in hps.spk.keys():
spklist.append(key)
MoeVSConf = {
"Folder" : f"{path}",
"Name" : f"{path}",
"Type" : "SoVits",
"Rate" : hps.data.sampling_rate,
"Hop" : hps.data.hop_length,
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
"SoVits4": True,
"SoVits3": False,
"CharaMix": export_mix,
"Volume": SVCVITS.vol_embedding,
"HiddenSize": SVCVITS.gin_channels,
"Characters": spklist
}
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
if __name__ == '__main__':
main()
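The speaker-mix branch of this script (carried over verbatim into the new onnx_export.py above) turns sid from a single speaker id into one mixing weight per speaker, repeated for every frame, so the frame axis can stay dynamic in the exported graph. A minimal sketch of the tensor it builds for a hypothetical 3-speaker model:

import torch

n_spk, num_frames = 3, 200
spk_mix = [1.0 / float(n_spk)] * n_spk      # equal weights, as in main()
sid = torch.tensor(spk_mix).unsqueeze(0)    # (1, n_spk)
sid = sid.repeat(num_frames, 1)             # (num_frames, n_spk), axis 0 dynamic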

View File

@@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):
 
 def f0_to_coarse(f0):
-    is_torch = isinstance(f0, torch.Tensor)
-    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
-
-    f0_mel[f0_mel <= 1] = 1
-    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-    f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
-    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
+    f0_mel = 1127 * (1 + f0 / 700).log()
+    a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
+    b = f0_mel_min * a - 1.
+    f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
+    # torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
+    f0_coarse = torch.round(f0_mel).long()
+    f0_coarse = f0_coarse * (f0_coarse > 0)
+    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
+    f0_coarse = f0_coarse * (f0_coarse < f0_bin)
+    f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
     return f0_coarse
 
 
 def get_content(cmodel, y):
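The rewrite is the same mapping in branch-free form: with a = (f0_bin - 2) / (f0_mel_max - f0_mel_min) and b = f0_mel_min * a - 1, the expression f0_mel * a - b equals (f0_mel - f0_mel_min) * a + 1, i.e. the old affine map, while the boolean-mask assignments become arithmetic masking that exports to ONNX without in-place scatter ops (the old `(f0_mel + 0.5).int()` rounding becomes torch.round, which can differ only on exact halves). A small check, assuming the module-level constants so-vits-svc defines in utils.py (f0_bin = 256, f0_min = 50.0, f0_max = 1100.0):

import numpy as np
import torch

f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

def f0_to_coarse(f0):
    # the ONNX-friendly version from the hunk above
    f0_mel = 1127 * (1 + f0 / 700).log()
    a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
    b = f0_mel_min * a - 1.
    f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
    f0_coarse = torch.round(f0_mel).long()
    f0_coarse = f0_coarse * (f0_coarse > 0)          # zero out non-positive bins
    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)    # clamp low end to 1
    f0_coarse = f0_coarse * (f0_coarse < f0_bin)     # clamp high end ...
    f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))  # ... to f0_bin - 1
    return f0_coarse

f0 = torch.tensor([0.0, 50.0, 220.0, 440.0, 1100.0])
print(f0_to_coarse(f0))  # unvoiced (0 Hz) frames land in bin 1, voiced in 1..255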

View File

@@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
         self.sampling_rate = samp_rate
         self.voiced_threshold = voiced_threshold
         self.flag_for_pulse = flag_for_pulse
+        self.onnx = False
 
     def _f02uv(self, f0):
         # generate uv signal
@@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
             sines = torch.cos(i_phase * 2 * np.pi)
         return sines
 
-    def forward(self, f0):
+    def forward(self, f0, upp=None):
         """ sine_tensor, uv = forward(f0)
         input F0: tensor(batchsize=1, length, dim=1)
                   f0 for unvoiced steps should be 0
         output sine_tensor: tensor(batchsize=1, length, dim)
         output uv: tensor(batchsize=1, length, 1)
         """
-        with torch.no_grad():
-            # fundamental component
-            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
-
-            # generate sine waveforms
-            sine_waves = self._f02sine(fn) * self.sine_amp
-
-            # generate uv signal
-            # uv = torch.ones(f0.shape)
-            # uv = uv * (f0 > self.voiced_threshold)
-            uv = self._f02uv(f0)
-
-            # noise: for unvoiced should be similar to sine_amp
-            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
-            #        for voiced regions is self.noise_std
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
-
-            # first: set the unvoiced part to 0 by uv
-            # then: additive noise
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+        if self.onnx:
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the per-harmonic products cannot be optimized away afterwards
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  (a % 1 here would keep the following cumsum from being optimized)
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+        else:
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp
+
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)
+
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                #        for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
 
 
 class SourceModuleHnNSF(torch.nn.Module):
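This ONNX branch is the core of the DML fix: instead of upsampling f0 to sample rate first and computing phase per sample, it accumulates phase at frame rate and stretches it to sample rate with F.interpolate and a constant scale_factor (upp), which exports cleanly and runs correctly under DirectML. A standalone illustration of the trick (not repository code; it omits the random initial phase and the cumsum_shift wrap correction shown above):

import numpy as np
import torch
import torch.nn.functional as F

sr, upp = 44100, 512                  # upp plays the role of np.prod(h["upsample_rates"])
f0 = torch.full((1, 20, 1), 220.0)    # frame-rate f0: (batch, frames, 1)

rad_per_frame = (f0 / sr) % 1                     # phase step per sample, held per frame
phase = torch.cumsum(rad_per_frame, dim=1) * upp  # accumulated phase at frame rate
phase = F.interpolate(phase.transpose(2, 1), scale_factor=upp,
                      mode="linear", align_corners=True).transpose(2, 1)
sine = torch.sin(2 * np.pi * (phase % 1))         # (1, 20 * upp, 1) sample-rate sine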
@@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
 
-    def forward(self, x):
+    def forward(self, x, upp=None):
         """
         Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
@@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
         noise_source (batchsize, length 1)
         """
         # source for harmonic branch
-        sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
 
         # source for noise branch, in the same shape as uv
@@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
         self.ups.apply(init_weights)
         self.conv_post.apply(init_weights)
         self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
+        self.upp = np.prod(h["upsample_rates"])
+        self.onnx = False
+
+    def OnnxExport(self):
+        self.onnx = True
+        self.m_source.l_sin_gen.onnx = True
 
     def forward(self, x, f0, g=None):
         # print(1,x.shape,f0.shape,f0[:, None].shape)
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        if not self.onnx:
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
         # print(2,f0.shape)
-        har_source, noi_source, uv = self.m_source(f0)
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
         har_source = har_source.transpose(1, 2)
         x = self.conv_pre(x)
         x = x + self.cond(g)
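After OnnxExport() the decoder thus has two f0 routes that produce a harmonic source of the same length. A hedged sketch of the difference (assuming `gen` is an already-constructed Generator with its config dict `h`; checkpoint and config loading are omitted here):

import torch

f0 = torch.rand(1, 50) * 300   # frame-rate f0, (batch, frames); `gen` assumed built elsewhere

# PyTorch/training path: f0 is upsampled to sample rate before SineGen runs per sample.
f0_up = gen.f0_upsamp(f0[:, None]).transpose(1, 2)   # (1, 50 * gen.upp, 1)
har, noi, uv = gen.m_source(f0_up)

# ONNX path: frame-rate f0 plus the constant factor go straight to the source module.
gen.OnnxExport()
har, noi, uv = gen.m_source(f0, gen.upp)             # same (1, 50 * gen.upp, 1) source

This is what onnx_export.py triggers with SVCVITS.dec.OnnxExport() right before torch.onnx.export, so the traced graph contains only the interpolate-based branch.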