NsfHifigan在DML中上采样出现错误以及SourceModuleHnNSF这两个BUG的修复

2023-07-17 17:51:55 +08:00 · 2023-07-17 17:51:55 +08:00 · 90c9ccc6a8
parent 72deb15df3
commit 90c9ccc6a8
5 changed files with 274 additions and 218 deletions
--- a/onnx_export.py
+++ b/onnx_export.py
@ -1,56 +1,138 @@
+import json
 import torch
-
 import utils
-from onnxexport.model_onnx import SynthesizerTrn
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn


-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-        
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-        
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
+def main():  # Export checkpoints/<path>/model.pth to ONNX and write a MoeVS JSON description of the model.
+    path = "crs"  # checkpoint folder under checkpoints/; also used in the output file names
+
+    device = torch.device("cpu")
+    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+    SVCVITS = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+    _ = SVCVITS.eval().to(device)
+    for i in SVCVITS.parameters():
+        i.requires_grad = False  # freeze every parameter; nothing is trained in this script
+    
+    num_frames = 200  # length of the dummy inputs; the frame axes are declared dynamic below
+
+    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
+    test_pitch = torch.rand(1, num_frames)
+    test_vol = torch.rand(1, num_frames)
+    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
+    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
+    test_noise = torch.randn(1, 192, num_frames)
+    test_sid = torch.LongTensor([0])  # single speaker id; replaced by mix weights when export_mix is set
+    export_mix = True
+    if len(hps.spk) < 2:  # speaker mixing is only exported when the config lists at least two speakers
+        export_mix = False
+    
+    if export_mix:
+        spk_mix = []
+        n_spk = len(hps.spk)
+        for i in range(n_spk):
+            spk_mix.append(1.0/float(n_spk))  # uniform weight for every speaker
+        test_sid = torch.tensor(spk_mix)
+        SVCVITS.export_chara_mix(hps.spk)
+        test_sid = test_sid.unsqueeze(0)
+        test_sid = test_sid.repeat(num_frames, 1)  # -> (num_frames, n_spk): one row of mix weights per frame
+    
+    SVCVITS.eval()
+
+    if export_mix:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2],
+            "sid":[0]  # mix weights are per frame, so their first axis is dynamic too
+        }
+    else:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2]
+        }
+    
+    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+    output_names = ["audio", ]
+
+    if SVCVITS.vol_embedding:  # models with a volume embedding get an extra "vol" input
+        input_names.append("vol")
+        vol_dadict = {"vol" : [1]}
+        daxes.update(vol_dadict)
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device)
+        )
+    else:
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device)
+        )
+
+    # SVCVITS = torch.jit.script(SVCVITS)
+    SVCVITS(test_hidden_unit.to(device),  # eager dry run; NOTE(review): passes test_vol even when vol_embedding is False - confirm forward() accepts it
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device))
+
+    SVCVITS.dec.OnnxExport()  # presumably the hifigan Generator: turns on its onnx flags after the dry run - confirm
+
+    torch.onnx.export(
+        SVCVITS,
+        test_inputs,
+        f"checkpoints/{path}/{path}_SoVits.onnx",
+        dynamic_axes=daxes,
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names
+    )
+
+    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"  # 768 channels -> layer-12, anything else -> layer-9
+    spklist = []
+    for key in hps.spk.keys():
+        spklist.append(key)
+
+    MoeVSConf = {
+        "Folder" : f"{path}",
+        "Name" : f"{path}",
+        "Type" : "SoVits",
+        "Rate" : hps.data.sampling_rate,
+        "Hop" : hps.data.hop_length,
+        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
+        "SoVits4": True,
+        "SoVits3": False,
+        "CharaMix": export_mix,
+        "Volume": SVCVITS.vol_embedding,
+        "HiddenSize": SVCVITS.gin_channels,
+        "Characters": spklist
+    }
+
+    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:  # NOTE: written beside the checkpoint folder, not inside it
+        json.dump(MoeVSConf, MoeVsConfFile, indent = 4)


 if __name__ == '__main__':
-    main(True)
+    main()
--- a/onnx_export_old.py
+++ b/onnx_export_old.py
@ -0,0 +1,56 @@
+import torch
+
+import utils
+from onnxexport.model_onnx import SynthesizerTrn
+
+
+def main(NetExport):  # Legacy exporter: writes checkpoints/<path>/model.onnx, and does nothing when NetExport is falsy.
+    path = "SoVits4.0"  # checkpoint folder under checkpoints/
+    if NetExport:
+        device = torch.device("cpu")
+        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+        SVCVITS = SynthesizerTrn(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model)
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+        _ = SVCVITS.eval().to(device)
+        for i in SVCVITS.parameters():
+            i.requires_grad = False  # freeze every parameter; nothing is trained in this script
+        
+        n_frame = 10  # length of the dummy inputs; the frame axes are declared dynamic below
+        test_hidden_unit = torch.rand(1, n_frame, 256)  # feature width is hard-coded to 256 in this exporter
+        test_pitch = torch.rand(1, n_frame)
+        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
+        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
+        test_noise = torch.randn(1, 192, n_frame)
+        test_sid = torch.LongTensor([0])  # single speaker id; this exporter has no speaker mixing
+        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+        output_names = ["audio", ]
+        
+        torch.onnx.export(SVCVITS,
+                          (
+                              test_hidden_unit.to(device),
+                              test_pitch.to(device),
+                              test_mel2ph.to(device),
+                              test_uv.to(device),
+                              test_noise.to(device),
+                              test_sid.to(device)
+                          ),
+                          f"checkpoints/{path}/model.onnx",
+                          dynamic_axes={  # "sid" is absent, so its shape stays fixed in the exported graph
+                              "c": [0, 1],
+                              "f0": [1],
+                              "mel2ph": [1],
+                              "uv": [1],
+                              "noise": [2],
+                          },
+                          do_constant_folding=False,
+                          opset_version=16,
+                          verbose=False,
+                          input_names=input_names,
+                          output_names=output_names)
+
+
+if __name__ == '__main__':
+    main(True)
--- a/onnx_export_speaker_mix.py
+++ b/onnx_export_speaker_mix.py
@ -1,138 +0,0 @@
-import json
-
-import torch
-
-import utils
-from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
-
-
-def main():
-    path = "crs"
-
-    device = torch.device("cpu")
-    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-    SVCVITS = SynthesizerTrn(
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **hps.model)
-    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-    _ = SVCVITS.eval().to(device)
-    for i in SVCVITS.parameters():
-        i.requires_grad = False
-    
-    num_frames = 200
-
-    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
-    test_pitch = torch.rand(1, num_frames)
-    test_vol = torch.rand(1, num_frames)
-    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
-    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
-    test_noise = torch.randn(1, 192, num_frames)
-    test_sid = torch.LongTensor([0])
-    export_mix = True
-    if len(hps.spk) < 2:
-        export_mix = False
-    
-    if export_mix:
-        spk_mix = []
-        n_spk = len(hps.spk)
-        for i in range(n_spk):
-            spk_mix.append(1.0/float(n_spk))
-        test_sid = torch.tensor(spk_mix)
-        SVCVITS.export_chara_mix(hps.spk)
-        test_sid = test_sid.unsqueeze(0)
-        test_sid = test_sid.repeat(num_frames, 1)
-    
-    SVCVITS.eval()
-
-    if export_mix:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2],
-            "sid":[0]
-        }
-    else:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2]
-        }
-    
-    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-    output_names = ["audio", ]
-
-    if SVCVITS.vol_embedding:
-        input_names.append("vol")
-        vol_dadict = {"vol" : [1]}
-        daxes.update(vol_dadict)
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device)
-        )
-    else:
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device)
-        )
-
-    # SVCVITS = torch.jit.script(SVCVITS)
-    SVCVITS(test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device))
-
-    torch.onnx.export(
-        SVCVITS,
-        test_inputs,
-        f"checkpoints/{path}/{path}_SoVits.onnx",
-        dynamic_axes=daxes,
-        do_constant_folding=False,
-        opset_version=16,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names
-    )
-
-    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
-    spklist = []
-    for key in hps.spk.keys():
-        spklist.append(key)
-
-    MoeVSConf = {
-        "Folder" : f"{path}",
-        "Name" : f"{path}",
-        "Type" : "SoVits",
-        "Rate" : hps.data.sampling_rate,
-        "Hop" : hps.data.hop_length,
-        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
-        "SoVits4": True,
-        "SoVits3": False,
-        "CharaMix": export_mix,
-        "Volume": SVCVITS.vol_embedding,
-        "HiddenSize": SVCVITS.gin_channels,
-        "Characters": spklist
-    }
-
-    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
-        json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
-
-
-if __name__ == '__main__':
-    main()
--- a/utils.py
+++ b/utils.py
@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):


 def f0_to_coarse(f0):
-  is_torch = isinstance(f0, torch.Tensor)
-  f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-  f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
-
-  f0_mel[f0_mel <= 1] = 1
-  f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-  f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
-  assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
+  f0_mel = 1127 * (1 + f0 / 700).log()  # Hz -> mel; f0 must be a torch tensor (the numpy branch is gone)
+  a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)  # slope of the affine map from mel to bin index
+  b = f0_mel_min * a - 1.  # offset chosen so f0_mel_min -> 1 and f0_mel_max -> f0_bin - 1
+  f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)  # non-positive mel values are left unscaled
+  # Clamp to [1, f0_bin - 1] with mask arithmetic instead of an in-place clip_ (presumably for ONNX export - confirm).
+  f0_coarse = torch.round(f0_mel).long()
+  f0_coarse = f0_coarse * (f0_coarse > 0)  # negative bins -> 0
+  f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)  # bin 0 -> 1 (lower bound)
+  # Both masks read the same pre-clamp tensor; zeroing first would leave nothing for the upper cap to match.
+  f0_coarse = f0_coarse * (f0_coarse < f0_bin) + (f0_coarse >= f0_bin) * (f0_bin - 1)
   return f0_coarse

 def get_content(cmodel, y):
--- a/vdecoder/hifigan/models.py
+++ b/vdecoder/hifigan/models.py
@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
+        self.onnx = False

    def _f02uv(self, f0):
        # generate uv signal
@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

-    def forward(self, f0):
+    def forward(self, f0, upp=None):  # upp: upsampling factor, only read on the ONNX path
        """ sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
-        with torch.no_grad():
-            # fundamental component
-            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+        if self.onnx:  # ONNX path: Generator.forward skips f0_upsamp, so the upsampling by upp happens in here
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)  # same reshaping Generator.forward applies around f0_upsamp otherwise
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # because of this % 1, the n_har multiplication cannot be optimized in post-processing
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0  # no random initial phase for the fundamental
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # applying % 1 here would prevent optimizing the later cumsum
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(
+                    2, 1
+                )  #######
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0  # a negative step after % 1 marks a phase wrap
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0  # subtract one full cycle at every wrap
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+            return sine_waves, uv, noise
+        else:  # original path: Generator.forward has already run f0 through f0_upsamp; upp is ignored
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))

-            # generate sine waveforms
-            sine_waves = self._f02sine(fn) * self.sine_amp
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp

-            # generate uv signal
-            # uv = torch.ones(f0.shape)
-            # uv = uv * (f0 > self.voiced_threshold)
-            uv = self._f02uv(f0)
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)

-            # noise: for unvoiced should be similar to sine_amp
-            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
-            # .       for voiced regions is self.noise_std
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                # .       for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)

-            # first: set the unvoiced part to 0 by uv
-            # then: additive noise
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+            return sine_waves, uv, noise


 class SourceModuleHnNSF(torch.nn.Module):
@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

-    def forward(self, x):
+    def forward(self, x, upp=None):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch
-        sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))

        # source for noise branch, in the same shape as uv
@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
+        self.upp = np.prod(h["upsample_rates"])
+        self.onnx = False
+
+    def OnnxExport(self):  # Switch this generator and its sine generator to the ONNX export code path.
+        self.onnx = True  # forward() then skips f0_upsamp and passes f0 on unchanged
+        self.m_source.l_sin_gen.onnx = True  # SineGen.forward then upsamples internally by upp

    def forward(self, x, f0, g=None):
        # print(1,x.shape,f0.shape,f0[:, None].shape)
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        if not self.onnx:
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        # print(2,f0.shape)
-        har_source, noi_source, uv = self.m_source(f0)
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        x = x + self.cond(g)