diff --git a/onnx_export.py b/onnx_export.py
index 27f49dd..2364aab 100644
--- a/onnx_export.py
+++ b/onnx_export.py
@@ -1,56 +1,138 @@
+import json
 import torch
-
 import utils
-from onnxexport.model_onnx import SynthesizerTrn
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
 
 
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
+def main():
+    path = "crs"
+
+    device = torch.device("cpu")
+    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+    SVCVITS = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+    _ = SVCVITS.eval().to(device)
+    for i in SVCVITS.parameters():
+        i.requires_grad = False
+
+    num_frames = 200
+
+    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
+    test_pitch = torch.rand(1, num_frames)
+    test_vol = torch.rand(1, num_frames)
+    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
+    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
+    test_noise = torch.randn(1, 192, num_frames)
+    test_sid = torch.LongTensor([0])
+    export_mix = True
+    if len(hps.spk) < 2:
+        export_mix = False
+
+    if export_mix:
+        spk_mix = []
+        n_spk = len(hps.spk)
+        for i in range(n_spk):
+            spk_mix.append(1.0 / float(n_spk))
+        test_sid = torch.tensor(spk_mix)
+        SVCVITS.export_chara_mix(hps.spk)
+        test_sid = test_sid.unsqueeze(0)
+        test_sid = test_sid.repeat(num_frames, 1)
+
+    SVCVITS.eval()
+
+    if export_mix:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2],
+            "sid": [0]
+        }
+    else:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2]
+        }
+
+    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+    output_names = ["audio", ]
+
+    if SVCVITS.vol_embedding:
+        input_names.append("vol")
+        vol_dadict = {"vol": [1]}
+        daxes.update(vol_dadict)
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device)
+        )
+    else:
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device)
+        )
+
+    # SVCVITS = torch.jit.script(SVCVITS)
+    SVCVITS(test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device))
+
+    SVCVITS.dec.OnnxExport()
+
+    torch.onnx.export(
+        SVCVITS,
+        test_inputs,
+        f"checkpoints/{path}/{path}_SoVits.onnx",
+        dynamic_axes=daxes,
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names
+    )
+
+    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
+    spklist = []
+    for key in hps.spk.keys():
+        spklist.append(key)
+
+    MoeVSConf = {
+        "Folder": f"{path}",
+        "Name": f"{path}",
+        "Type": "SoVits",
+        "Rate": hps.data.sampling_rate,
+        "Hop": hps.data.hop_length,
+        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
+        "SoVits4": True,
+        "SoVits3": False,
+        "CharaMix": export_mix,
+        "Volume": SVCVITS.vol_embedding,
+        "HiddenSize": SVCVITS.gin_channels,
+        "Characters": spklist
+    }
+
+    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
+        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
 
 
 if __name__ == '__main__':
-    main(True)
+    main()
diff --git a/onnx_export_old.py b/onnx_export_old.py
new file mode 100644
index 0000000..27f49dd
--- /dev/null
+++ b/onnx_export_old.py
@@ -0,0 +1,56 @@
+import torch
+
+import utils
+from onnxexport.model_onnx import SynthesizerTrn
+
+
+def main(NetExport):
+    path = "SoVits4.0"
+    if NetExport:
+        device = torch.device("cpu")
+        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+        SVCVITS = SynthesizerTrn(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model)
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+        _ = SVCVITS.eval().to(device)
+        for i in SVCVITS.parameters():
+            i.requires_grad = False
+
+        n_frame = 10
+        test_hidden_unit = torch.rand(1, n_frame, 256)
+        test_pitch = torch.rand(1, n_frame)
+        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
+        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
+        test_noise = torch.randn(1, 192, n_frame)
+        test_sid = torch.LongTensor([0])
+        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+        output_names = ["audio", ]
+
+        torch.onnx.export(SVCVITS,
+                          (
+                              test_hidden_unit.to(device),
+                              test_pitch.to(device),
+                              test_mel2ph.to(device),
+                              test_uv.to(device),
+                              test_noise.to(device),
+                              test_sid.to(device)
+                          ),
+                          f"checkpoints/{path}/model.onnx",
+                          dynamic_axes={
+                              "c": [0, 1],
+                              "f0": [1],
+                              "mel2ph": [1],
+                              "uv": [1],
+                              "noise": [2],
+                          },
+                          do_constant_folding=False,
+                          opset_version=16,
+                          verbose=False,
+                          input_names=input_names,
+                          output_names=output_names)
+
+
+if __name__ == '__main__':
+    main(True)
diff --git a/onnx_export_speaker_mix.py b/onnx_export_speaker_mix.py
deleted file mode 100644
index 42c11ad..0000000
--- a/onnx_export_speaker_mix.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import json
-
-import torch
-
-import utils
-from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
-
-
-def main():
-    path = "crs"
-
-    device = torch.device("cpu")
-    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-    SVCVITS = SynthesizerTrn(
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **hps.model)
-    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-    _ = SVCVITS.eval().to(device)
-    for i in SVCVITS.parameters():
-        i.requires_grad = False
-
-    num_frames = 200
-
-    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
-    test_pitch = torch.rand(1, num_frames)
-    test_vol = torch.rand(1, num_frames)
-    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
-    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
-    test_noise = torch.randn(1, 192, num_frames)
-    test_sid = torch.LongTensor([0])
-    export_mix = True
-    if len(hps.spk) < 2:
-        export_mix = False
-
-    if export_mix:
-        spk_mix = []
-        n_spk = len(hps.spk)
-        for i in range(n_spk):
-            spk_mix.append(1.0 / float(n_spk))
-        test_sid = torch.tensor(spk_mix)
-        SVCVITS.export_chara_mix(hps.spk)
-        test_sid = test_sid.unsqueeze(0)
-        test_sid = test_sid.repeat(num_frames, 1)
-
-    SVCVITS.eval()
-
-    if export_mix:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2],
-            "sid": [0]
-        }
-    else:
-        daxes = {
-            "c": [0, 1],
-            "f0": [1],
-            "mel2ph": [1],
-            "uv": [1],
-            "noise": [2]
-        }
-
-    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-    output_names = ["audio", ]
-
-    if SVCVITS.vol_embedding:
-        input_names.append("vol")
-        vol_dadict = {"vol": [1]}
-        daxes.update(vol_dadict)
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device)
-        )
-    else:
-        test_inputs = (
-            test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device)
-        )
-
-    # SVCVITS = torch.jit.script(SVCVITS)
-    SVCVITS(test_hidden_unit.to(device),
-            test_pitch.to(device),
-            test_mel2ph.to(device),
-            test_uv.to(device),
-            test_noise.to(device),
-            test_sid.to(device),
-            test_vol.to(device))
-
-    torch.onnx.export(
-        SVCVITS,
-        test_inputs,
-        f"checkpoints/{path}/{path}_SoVits.onnx",
-        dynamic_axes=daxes,
-        do_constant_folding=False,
-        opset_version=16,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names
-    )
-
-    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
-    spklist = []
-    for key in hps.spk.keys():
-        spklist.append(key)
-
-    MoeVSConf = {
-        "Folder": f"{path}",
-        "Name": f"{path}",
-        "Type": "SoVits",
-        "Rate": hps.data.sampling_rate,
-        "Hop": hps.data.hop_length,
-        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
-        "SoVits4": True,
-        "SoVits3": False,
-        "CharaMix": export_mix,
-        "Volume": SVCVITS.vol_embedding,
-        "HiddenSize": SVCVITS.gin_channels,
-        "Characters": spklist
-    }
-
-    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
-        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/utils.py b/utils.py
index df1b51e..98dd163 100644
--- a/utils.py
+++ b/utils.py
@@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):
 
 def f0_to_coarse(f0):
-    is_torch = isinstance(f0, torch.Tensor)
-    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
-
-    f0_mel[f0_mel <= 1] = 1
-    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-    f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
-    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
+    f0_mel = 1127 * (1 + f0 / 700).log()
+    a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
+    b = f0_mel_min * a - 1.
+    f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
+    # torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
+    f0_coarse = torch.round(f0_mel).long()
+    f0_coarse = f0_coarse * (f0_coarse > 0)
+    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
+    f0_coarse = f0_coarse * (f0_coarse < f0_bin)
+    f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
     return f0_coarse
 
 
 def get_content(cmodel, y):
diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py
index 8e79752..1075533 100644
--- a/vdecoder/hifigan/models.py
+++ b/vdecoder/hifigan/models.py
@@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
         self.sampling_rate = samp_rate
         self.voiced_threshold = voiced_threshold
         self.flag_for_pulse = flag_for_pulse
+        self.onnx = False
 
     def _f02uv(self, f0):
         # generate uv signal
@@ -193,35 +194,81 @@
             sines = torch.cos(i_phase * 2 * np.pi)
         return sines
 
-    def forward(self, f0):
+    def forward(self, f0, upp=None):
         """ sine_tensor, uv = forward(f0)
         input F0: tensor(batchsize=1, length, dim=1)
                   f0 for unvoiced steps should be 0
         output sine_tensor: tensor(batchsize=1, length, dim)
         output uv: tensor(batchsize=1, length, 1)
         """
-        with torch.no_grad():
-            # fundamental component
-            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+        if self.onnx:
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, i.e. the (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 keeps the per-harmonic products from being folded away afterwards
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1 here would keep the following cumsum from being optimized
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(
+                    2, 1
+                )
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+            return sine_waves, uv, noise
+        else:
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
 
-            # generate sine waveforms
-            sine_waves = self._f02sine(fn) * self.sine_amp
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp
 
-            # generate uv signal
-            # uv = torch.ones(f0.shape)
-            # uv = uv * (f0 > self.voiced_threshold)
-            uv = self._f02uv(f0)
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)
 
-            # noise: for unvoiced should be similar to sine_amp
-            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
-            #        for voiced regions is self.noise_std
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                #        for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
 
-            # first: set the unvoiced part to 0 by uv
-            # then: additive noise
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+            return sine_waves, uv, noise
 
 
 class SourceModuleHnNSF(torch.nn.Module):
@@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
 
         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
 
-    def forward(self, x):
+    def forward(self, x, upp=None):
         """ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
@@ -265,7 +312,7 @@
         noise_source (batchsize, length 1)
         """
         # source for harmonic branch
-        sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
 
         # source for noise branch, in the same shape as uv
@@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
         self.ups.apply(init_weights)
         self.conv_post.apply(init_weights)
         self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
+        self.upp = np.prod(h["upsample_rates"])
+        self.onnx = False
+
+    def OnnxExport(self):
+        self.onnx = True
+        self.m_source.l_sin_gen.onnx = True
 
     def forward(self, x, f0, g=None):
         # print(1,x.shape,f0.shape,f0[:, None].shape)
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        if not self.onnx:
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
         # print(2,f0.shape)
-        har_source, noi_source, uv = self.m_source(f0)
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
         har_source = har_source.transpose(1, 2)
         x = self.conv_pre(x)
         x = x + self.cond(g)
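Once exported, the graph can be smoke-tested from Python. Below is a minimal sketch using onnxruntime; the input names and layouts mirror the torch.onnx.export call above, while the file path, frame count, hidden size, and the two-speaker mix are illustrative assumptions, not values fixed by this patch.

# Sketch: smoke test for the exported graph (not part of the patch).
# Path, frame count, hidden size, and speaker count are assumptions.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("checkpoints/crs/crs_SoVits.onnx")

n_frames = 200
hidden_size = 768   # assumes a vec-768 checkpoint (SVCVITS.gin_channels)
n_speakers = 2      # assumes config.json lists two speakers (CharaMix export)

feed = {
    "c": np.random.rand(1, n_frames, hidden_size).astype(np.float32),
    "f0": np.full((1, n_frames), 440.0, dtype=np.float32),
    "mel2ph": np.arange(n_frames, dtype=np.int64)[None],
    "uv": np.ones((1, n_frames), dtype=np.float32),
    "noise": np.random.randn(1, 192, n_frames).astype(np.float32),
    # CharaMix: one mix weight per speaker for every frame.
    "sid": np.full((n_frames, n_speakers), 1.0 / n_speakers, dtype=np.float32),
}
# Only for checkpoints trained with vol_embedding:
# feed["vol"] = np.ones((1, n_frames), dtype=np.float32)

audio = sess.run(["audio"], feed)[0]
print(audio.shape)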
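The f0_to_coarse rewrite in utils.py trades boolean-mask assignment, which lowers poorly to ONNX, for torch.where plus mask arithmetic; a and b simply fold the old (x - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 mapping into x * a - b. Note the last two mask steps assume round(f0_mel) never reaches f0_bin, which holds by construction of a and b whenever f0 stays at or below f0_max, so the self-check below sticks to in-range values. The constants (f0_bin = 256, f0 range 50-1100 Hz) are the usual so-vits-svc values, assumed here rather than stated in the diff.

# Sketch: verify the mask arithmetic equals a plain clamp to [1, f0_bin - 1]
# for in-range f0. Constants are assumed so-vits-svc defaults.
import numpy as np
import torch

f0_bin = 256
f0_mel_min = 1127 * np.log(1 + 50 / 700)    # f0_min = 50 Hz
f0_mel_max = 1127 * np.log(1 + 1100 / 700)  # f0_max = 1100 Hz

f0 = torch.tensor([[0.0, 50.0, 220.0, 440.0, 1100.0]])
f0_mel = 1127 * (1 + f0 / 700).log()
a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
b = f0_mel_min * a - 1.
f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)

f0_coarse = torch.round(f0_mel).long()
f0_coarse = f0_coarse * (f0_coarse > 0)        # zero out negatives
f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)  # lower clamp to 1
f0_coarse = f0_coarse * (f0_coarse < f0_bin)   # in-range values pass through
f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))

assert torch.equal(f0_coarse,
                   torch.round(f0_mel).long().clamp(1, f0_bin - 1))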
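In the ONNX branch of SineGen.forward, F0 stays at frame rate until the per-sample phase increments rad_values = f0 / sampling_rate are upsampled by upp = np.prod(h["upsample_rates"]) (one F0 frame per hop) and integrated with cumsum; cumsum_shift subtracts 1 wherever the linearly interpolated running phase wraps past 1.0, so the integrated phase stays continuous modulo 1. A stripped-down sketch of just that accumulation, with made-up sizes and a single harmonic; see SineGen.forward above for the real thing:

# Sketch: cumsum-based phase accumulation as used in the ONNX branch.
import numpy as np
import torch
import torch.nn.functional as F

sampling_rate, upp = 44100, 512          # upp = prod(upsample_rates), assumed
f0 = torch.full((1, 10, 1), 220.0)       # (batch, frames, harmonics=1), made up

rad_values = (f0 / sampling_rate) % 1    # per-sample phase increment, frame rate
# Running phase at frame boundaries (each frame spans upp samples),
# linearly interpolated up to sample rate, then wrapped into [0, 1).
tmp_over_one = torch.cumsum(rad_values, 1) * upp
tmp_over_one = F.interpolate(tmp_over_one.transpose(2, 1), scale_factor=upp,
                             mode="linear", align_corners=True).transpose(2, 1) % 1
rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp,
                           mode="nearest").transpose(2, 1)

# Wherever the wrapped running phase decreases, subtract 1 before integrating
# so the final cumsum matches the wrapped phase up to whole periods.
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = ((tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0) * -1.0
sine = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
print(sine.shape)  # (1, 10 * upp, 1)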