Fix two bugs: the NsfHifigan upsampling error under DML, and SourceModuleHnNSF

白叶 藤原 2023-07-17 17:51:55 +08:00
parent 72deb15df3
commit 90c9ccc6a8
5 changed files with 274 additions and 218 deletions

View File

@@ -1,56 +1,138 @@
+import json
+
 import torch
 import utils
-from onnxexport.model_onnx import SynthesizerTrn
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
 
 
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
+def main():
+    path = "crs"
+    device = torch.device("cpu")
+    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+    SVCVITS = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
+    _ = SVCVITS.eval().to(device)
+    for i in SVCVITS.parameters():
+        i.requires_grad = False
+    num_frames = 200
+
+    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
+    test_pitch = torch.rand(1, num_frames)
+    test_vol = torch.rand(1, num_frames)
+    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
+    test_uv = torch.ones(1, num_frames, dtype=torch.float32)
+    test_noise = torch.randn(1, 192, num_frames)
+    test_sid = torch.LongTensor([0])
+    export_mix = True
+    if len(hps.spk) < 2:
+        export_mix = False
+
+    if export_mix:
+        spk_mix = []
+        n_spk = len(hps.spk)
+        for i in range(n_spk):
+            spk_mix.append(1.0/float(n_spk))
+        test_sid = torch.tensor(spk_mix)
+        SVCVITS.export_chara_mix(hps.spk)
+        test_sid = test_sid.unsqueeze(0)
+        test_sid = test_sid.repeat(num_frames, 1)
+
+    SVCVITS.eval()
+
+    if export_mix:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2],
+            "sid": [0]
+        }
+    else:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2]
+        }
+
+    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
+    output_names = ["audio", ]
+
+    if SVCVITS.vol_embedding:
+        input_names.append("vol")
+        vol_dadict = {"vol": [1]}
+        daxes.update(vol_dadict)
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device)
+        )
+    else:
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device)
+        )
+
+    # SVCVITS = torch.jit.script(SVCVITS)
+    SVCVITS(test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device))
+
+    SVCVITS.dec.OnnxExport()
+
+    torch.onnx.export(
+        SVCVITS,
+        test_inputs,
+        f"checkpoints/{path}/{path}_SoVits.onnx",
+        dynamic_axes=daxes,
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names
+    )
+
+    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
+    spklist = []
+    for key in hps.spk.keys():
+        spklist.append(key)
+
+    MoeVSConf = {
+        "Folder": f"{path}",
+        "Name": f"{path}",
+        "Type": "SoVits",
+        "Rate": hps.data.sampling_rate,
+        "Hop": hps.data.hop_length,
+        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
+        "SoVits4": True,
+        "SoVits3": False,
+        "CharaMix": export_mix,
+        "Volume": SVCVITS.vol_embedding,
+        "HiddenSize": SVCVITS.gin_channels,
+        "Characters": spklist
+    }
+
+    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
+        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
 
 
 if __name__ == '__main__':
-    main(True)
+    main()
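For orientation (not part of the commit): the exported graph can be driven through onnxruntime's DirectML backend, which is the "DML" the commit title refers to. A minimal sketch, assuming a single-speaker model (export_mix stays False, no vol embedding) and 256-dim content features; input names and shapes mirror the dummy tensors above, and DmlExecutionProvider is provided by the onnxruntime-directml package:

import numpy as np
import onnxruntime as ort

# Session on DirectML, falling back to CPU if the DML provider is unavailable.
sess = ort.InferenceSession(
    "checkpoints/crs/crs_SoVits.onnx",
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)
n = 200  # the frame axis is dynamic, per daxes above
feed = {
    "c": np.random.rand(1, n, 256).astype(np.float32),       # content features
    "f0": (np.random.rand(1, n) * 400).astype(np.float32),   # frame-rate pitch in Hz
    "mel2ph": np.arange(n, dtype=np.int64)[None],
    "uv": np.ones((1, n), dtype=np.float32),
    "noise": np.random.randn(1, 192, n).astype(np.float32),
    "sid": np.array([0], dtype=np.int64),                    # single speaker id
}
audio = sess.run(["audio"], feed)[0]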

onnx_export_old.py (new file, 56 lines)
View File

@@ -0,0 +1,56 @@
import torch
import utils
from onnxexport.model_onnx import SynthesizerTrn
def main(NetExport):
path = "SoVits4.0"
if NetExport:
device = torch.device("cpu")
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
SVCVITS = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model)
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
_ = SVCVITS.eval().to(device)
for i in SVCVITS.parameters():
i.requires_grad = False
n_frame = 10
test_hidden_unit = torch.rand(1, n_frame, 256)
test_pitch = torch.rand(1, n_frame)
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
test_noise = torch.randn(1, 192, n_frame)
test_sid = torch.LongTensor([0])
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
output_names = ["audio", ]
torch.onnx.export(SVCVITS,
(
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device)
),
f"checkpoints/{path}/model.onnx",
dynamic_axes={
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2],
},
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names)
if __name__ == '__main__':
main(True)

View File

@@ -1,138 +0,0 @@
import json
import torch
import utils
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
def main():
path = "crs"
device = torch.device("cpu")
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
SVCVITS = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model)
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
_ = SVCVITS.eval().to(device)
for i in SVCVITS.parameters():
i.requires_grad = False
num_frames = 200
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
test_pitch = torch.rand(1, num_frames)
test_vol = torch.rand(1, num_frames)
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
test_noise = torch.randn(1, 192, num_frames)
test_sid = torch.LongTensor([0])
export_mix = True
if len(hps.spk) < 2:
export_mix = False
if export_mix:
spk_mix = []
n_spk = len(hps.spk)
for i in range(n_spk):
spk_mix.append(1.0/float(n_spk))
test_sid = torch.tensor(spk_mix)
SVCVITS.export_chara_mix(hps.spk)
test_sid = test_sid.unsqueeze(0)
test_sid = test_sid.repeat(num_frames, 1)
SVCVITS.eval()
if export_mix:
daxes = {
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2],
"sid":[0]
}
else:
daxes = {
"c": [0, 1],
"f0": [1],
"mel2ph": [1],
"uv": [1],
"noise": [2]
}
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
output_names = ["audio", ]
if SVCVITS.vol_embedding:
input_names.append("vol")
vol_dadict = {"vol" : [1]}
daxes.update(vol_dadict)
test_inputs = (
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device),
test_vol.to(device)
)
else:
test_inputs = (
test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device)
)
# SVCVITS = torch.jit.script(SVCVITS)
SVCVITS(test_hidden_unit.to(device),
test_pitch.to(device),
test_mel2ph.to(device),
test_uv.to(device),
test_noise.to(device),
test_sid.to(device),
test_vol.to(device))
torch.onnx.export(
SVCVITS,
test_inputs,
f"checkpoints/{path}/{path}_SoVits.onnx",
dynamic_axes=daxes,
do_constant_folding=False,
opset_version=16,
verbose=False,
input_names=input_names,
output_names=output_names
)
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
spklist = []
for key in hps.spk.keys():
spklist.append(key)
MoeVSConf = {
"Folder" : f"{path}",
"Name" : f"{path}",
"Type" : "SoVits",
"Rate" : hps.data.sampling_rate,
"Hop" : hps.data.hop_length,
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
"SoVits4": True,
"SoVits3": False,
"CharaMix": export_mix,
"Volume": SVCVITS.vol_embedding,
"HiddenSize": SVCVITS.gin_channels,
"Characters": spklist
}
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
if __name__ == '__main__':
main()
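The speaker-mix branch of this script (carried over verbatim into the new onnx_export.py above) turns sid from a single speaker id into one mixing weight per speaker, repeated for every frame, so the frame axis can stay dynamic in the exported graph. A minimal sketch of the tensor it builds for a hypothetical 3-speaker model:

import torch

n_spk, num_frames = 3, 200
spk_mix = [1.0 / float(n_spk)] * n_spk      # equal weights, as in main()
sid = torch.tensor(spk_mix).unsqueeze(0)    # (1, n_spk)
sid = sid.repeat(num_frames, 1)             # (num_frames, n_spk), axis 0 dynamic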

View File

@@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):
 
 def f0_to_coarse(f0):
-    is_torch = isinstance(f0, torch.Tensor)
-    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
-    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
-
-    f0_mel[f0_mel <= 1] = 1
-    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
-    f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
-    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
+    f0_mel = 1127 * (1 + f0 / 700).log()
+    a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
+    b = f0_mel_min * a - 1.
+    f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
+    # torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
+    f0_coarse = torch.round(f0_mel).long()
+    f0_coarse = f0_coarse * (f0_coarse > 0)
+    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
+    f0_coarse = f0_coarse * (f0_coarse < f0_bin)
+    f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
     return f0_coarse
 
 
 def get_content(cmodel, y):
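The rewrite is the same mapping in branch-free form: with a = (f0_bin - 2) / (f0_mel_max - f0_mel_min) and b = f0_mel_min * a - 1, the expression f0_mel * a - b equals (f0_mel - f0_mel_min) * a + 1, i.e. the old affine map, while the boolean-mask assignments become arithmetic masking that exports to ONNX without in-place scatter ops (the old `(f0_mel + 0.5).int()` rounding becomes torch.round, which can differ only on exact halves). A small check, assuming the module-level constants so-vits-svc defines in utils.py (f0_bin = 256, f0_min = 50.0, f0_max = 1100.0):

import numpy as np
import torch

f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

def f0_to_coarse(f0):
    # the ONNX-friendly version from the hunk above
    f0_mel = 1127 * (1 + f0 / 700).log()
    a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
    b = f0_mel_min * a - 1.
    f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
    f0_coarse = torch.round(f0_mel).long()
    f0_coarse = f0_coarse * (f0_coarse > 0)          # zero out non-positive bins
    f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)    # clamp low end to 1
    f0_coarse = f0_coarse * (f0_coarse < f0_bin)     # clamp high end ...
    f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))  # ... to f0_bin - 1
    return f0_coarse

f0 = torch.tensor([0.0, 50.0, 220.0, 440.0, 1100.0])
print(f0_to_coarse(f0))  # unvoiced (0 Hz) frames land in bin 1, voiced in 1..255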

View File

@@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
         self.sampling_rate = samp_rate
         self.voiced_threshold = voiced_threshold
         self.flag_for_pulse = flag_for_pulse
+        self.onnx = False
 
     def _f02uv(self, f0):
         # generate uv signal
@@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
             sines = torch.cos(i_phase * 2 * np.pi)
         return sines
 
-    def forward(self, f0):
+    def forward(self, f0, upp=None):
         """ sine_tensor, uv = forward(f0)
         input F0: tensor(batchsize=1, length, dim=1)
                   f0 for unvoiced steps should be 0
         output sine_tensor: tensor(batchsize=1, length, dim)
         output uv: tensor(batchsize=1, length, 1)
         """
-        with torch.no_grad():
-            # fundamental component
-            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
-
-            # generate sine waveforms
-            sine_waves = self._f02sine(fn) * self.sine_amp
-
-            # generate uv signal
-            # uv = torch.ones(f0.shape)
-            # uv = uv * (f0 > self.voiced_threshold)
-            uv = self._f02uv(f0)
-
-            # noise: for unvoiced should be similar to sine_amp
-            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
-            #        for voiced regions is self.noise_std
-            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-            noise = noise_amp * torch.randn_like(sine_waves)
-
-            # first: set the unvoiced part to 0 by uv
-            # then: additive noise
-            sine_waves = sine_waves * uv + noise
-        return sine_waves, uv, noise
+        if self.onnx:
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the per-harmonic products cannot be optimized away afterwards
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  (a % 1 here would keep the following cumsum from being optimized)
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+        else:
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp
+
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)
+
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                #        for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
 
 
 class SourceModuleHnNSF(torch.nn.Module):
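This ONNX branch is the core of the DML fix: instead of upsampling f0 to sample rate first and computing phase per sample, it accumulates phase at frame rate and stretches it to sample rate with F.interpolate and a constant scale_factor (upp), which exports cleanly and runs correctly under DirectML. A standalone illustration of the trick (not repository code; it omits the random initial phase and the cumsum_shift wrap correction shown above):

import numpy as np
import torch
import torch.nn.functional as F

sr, upp = 44100, 512                  # upp plays the role of np.prod(h["upsample_rates"])
f0 = torch.full((1, 20, 1), 220.0)    # frame-rate f0: (batch, frames, 1)

rad_per_frame = (f0 / sr) % 1                     # phase step per sample, held per frame
phase = torch.cumsum(rad_per_frame, dim=1) * upp  # accumulated phase at frame rate
phase = F.interpolate(phase.transpose(2, 1), scale_factor=upp,
                      mode="linear", align_corners=True).transpose(2, 1)
sine = torch.sin(2 * np.pi * (phase % 1))         # (1, 20 * upp, 1) sample-rate sine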
@@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
 
-    def forward(self, x):
+    def forward(self, x, upp=None):
         """
         Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
@@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
         noise_source (batchsize, length 1)
         """
         # source for harmonic branch
-        sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
         sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
 
         # source for noise branch, in the same shape as uv
@@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
         self.ups.apply(init_weights)
         self.conv_post.apply(init_weights)
         self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
+        self.upp = np.prod(h["upsample_rates"])
+        self.onnx = False
+
+    def OnnxExport(self):
+        self.onnx = True
+        self.m_source.l_sin_gen.onnx = True
 
     def forward(self, x, f0, g=None):
         # print(1,x.shape,f0.shape,f0[:, None].shape)
-        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        if not self.onnx:
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
         # print(2,f0.shape)
-        har_source, noi_source, uv = self.m_source(f0)
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
         har_source = har_source.transpose(1, 2)
         x = self.conv_pre(x)
         x = x + self.cond(g)
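After OnnxExport() the decoder thus has two f0 routes that produce a harmonic source of the same length. A hedged sketch of the difference (assuming `gen` is an already-constructed Generator with its config dict `h`; checkpoint and config loading are omitted here):

import torch

f0 = torch.rand(1, 50) * 300   # frame-rate f0, (batch, frames); `gen` assumed built elsewhere

# PyTorch/training path: f0 is upsampled to sample rate before SineGen runs per sample.
f0_up = gen.f0_upsamp(f0[:, None]).transpose(1, 2)   # (1, 50 * gen.upp, 1)
har, noi, uv = gen.m_source(f0_up)

# ONNX path: frame-rate f0 plus the constant factor go straight to the source module.
gen.OnnxExport()
har, noi, uv = gen.m_source(f0, gen.upp)             # same (1, 50 * gen.upp, 1) source

This is what onnx_export.py triggers with SVCVITS.dec.OnnxExport() right before torch.onnx.export, so the traced graph contains only the interpolate-based branch.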