NsfHifigan在DML中上采样出现错误以及SourceModuleHnNSF这两个BUG的修复
This commit is contained in:
parent
72deb15df3
commit
90c9ccc6a8
180
onnx_export.py
180
onnx_export.py
|
@ -1,56 +1,138 @@
|
||||||
|
import json
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
from onnxexport.model_onnx import SynthesizerTrn
|
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
||||||
|
|
||||||
|
|
||||||
def main(NetExport):
|
def main():
|
||||||
path = "SoVits4.0"
|
path = "crs"
|
||||||
if NetExport:
|
|
||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||||
SVCVITS = SynthesizerTrn(
|
SVCVITS = SynthesizerTrn(
|
||||||
hps.data.filter_length // 2 + 1,
|
hps.data.filter_length // 2 + 1,
|
||||||
hps.train.segment_size // hps.data.hop_length,
|
hps.train.segment_size // hps.data.hop_length,
|
||||||
**hps.model)
|
**hps.model)
|
||||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||||
_ = SVCVITS.eval().to(device)
|
_ = SVCVITS.eval().to(device)
|
||||||
for i in SVCVITS.parameters():
|
for i in SVCVITS.parameters():
|
||||||
i.requires_grad = False
|
i.requires_grad = False
|
||||||
|
|
||||||
n_frame = 10
|
num_frames = 200
|
||||||
test_hidden_unit = torch.rand(1, n_frame, 256)
|
|
||||||
test_pitch = torch.rand(1, n_frame)
|
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
|
||||||
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
|
test_pitch = torch.rand(1, num_frames)
|
||||||
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
|
test_vol = torch.rand(1, num_frames)
|
||||||
test_noise = torch.randn(1, 192, n_frame)
|
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
|
||||||
test_sid = torch.LongTensor([0])
|
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
|
||||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
test_noise = torch.randn(1, 192, num_frames)
|
||||||
output_names = ["audio", ]
|
test_sid = torch.LongTensor([0])
|
||||||
|
export_mix = True
|
||||||
torch.onnx.export(SVCVITS,
|
if len(hps.spk) < 2:
|
||||||
(
|
export_mix = False
|
||||||
test_hidden_unit.to(device),
|
|
||||||
test_pitch.to(device),
|
if export_mix:
|
||||||
test_mel2ph.to(device),
|
spk_mix = []
|
||||||
test_uv.to(device),
|
n_spk = len(hps.spk)
|
||||||
test_noise.to(device),
|
for i in range(n_spk):
|
||||||
test_sid.to(device)
|
spk_mix.append(1.0/float(n_spk))
|
||||||
),
|
test_sid = torch.tensor(spk_mix)
|
||||||
f"checkpoints/{path}/model.onnx",
|
SVCVITS.export_chara_mix(hps.spk)
|
||||||
dynamic_axes={
|
test_sid = test_sid.unsqueeze(0)
|
||||||
"c": [0, 1],
|
test_sid = test_sid.repeat(num_frames, 1)
|
||||||
"f0": [1],
|
|
||||||
"mel2ph": [1],
|
SVCVITS.eval()
|
||||||
"uv": [1],
|
|
||||||
"noise": [2],
|
if export_mix:
|
||||||
},
|
daxes = {
|
||||||
do_constant_folding=False,
|
"c": [0, 1],
|
||||||
opset_version=16,
|
"f0": [1],
|
||||||
verbose=False,
|
"mel2ph": [1],
|
||||||
input_names=input_names,
|
"uv": [1],
|
||||||
output_names=output_names)
|
"noise": [2],
|
||||||
|
"sid":[0]
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
daxes = {
|
||||||
|
"c": [0, 1],
|
||||||
|
"f0": [1],
|
||||||
|
"mel2ph": [1],
|
||||||
|
"uv": [1],
|
||||||
|
"noise": [2]
|
||||||
|
}
|
||||||
|
|
||||||
|
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||||
|
output_names = ["audio", ]
|
||||||
|
|
||||||
|
if SVCVITS.vol_embedding:
|
||||||
|
input_names.append("vol")
|
||||||
|
vol_dadict = {"vol" : [1]}
|
||||||
|
daxes.update(vol_dadict)
|
||||||
|
test_inputs = (
|
||||||
|
test_hidden_unit.to(device),
|
||||||
|
test_pitch.to(device),
|
||||||
|
test_mel2ph.to(device),
|
||||||
|
test_uv.to(device),
|
||||||
|
test_noise.to(device),
|
||||||
|
test_sid.to(device),
|
||||||
|
test_vol.to(device)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
test_inputs = (
|
||||||
|
test_hidden_unit.to(device),
|
||||||
|
test_pitch.to(device),
|
||||||
|
test_mel2ph.to(device),
|
||||||
|
test_uv.to(device),
|
||||||
|
test_noise.to(device),
|
||||||
|
test_sid.to(device)
|
||||||
|
)
|
||||||
|
|
||||||
|
# SVCVITS = torch.jit.script(SVCVITS)
|
||||||
|
SVCVITS(test_hidden_unit.to(device),
|
||||||
|
test_pitch.to(device),
|
||||||
|
test_mel2ph.to(device),
|
||||||
|
test_uv.to(device),
|
||||||
|
test_noise.to(device),
|
||||||
|
test_sid.to(device),
|
||||||
|
test_vol.to(device))
|
||||||
|
|
||||||
|
SVCVITS.dec.OnnxExport()
|
||||||
|
|
||||||
|
torch.onnx.export(
|
||||||
|
SVCVITS,
|
||||||
|
test_inputs,
|
||||||
|
f"checkpoints/{path}/{path}_SoVits.onnx",
|
||||||
|
dynamic_axes=daxes,
|
||||||
|
do_constant_folding=False,
|
||||||
|
opset_version=16,
|
||||||
|
verbose=False,
|
||||||
|
input_names=input_names,
|
||||||
|
output_names=output_names
|
||||||
|
)
|
||||||
|
|
||||||
|
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
|
||||||
|
spklist = []
|
||||||
|
for key in hps.spk.keys():
|
||||||
|
spklist.append(key)
|
||||||
|
|
||||||
|
MoeVSConf = {
|
||||||
|
"Folder" : f"{path}",
|
||||||
|
"Name" : f"{path}",
|
||||||
|
"Type" : "SoVits",
|
||||||
|
"Rate" : hps.data.sampling_rate,
|
||||||
|
"Hop" : hps.data.hop_length,
|
||||||
|
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
|
||||||
|
"SoVits4": True,
|
||||||
|
"SoVits3": False,
|
||||||
|
"CharaMix": export_mix,
|
||||||
|
"Volume": SVCVITS.vol_embedding,
|
||||||
|
"HiddenSize": SVCVITS.gin_channels,
|
||||||
|
"Characters": spklist
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
|
||||||
|
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main(True)
|
main()
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
import torch
|
||||||
|
|
||||||
|
import utils
|
||||||
|
from onnxexport.model_onnx import SynthesizerTrn
|
||||||
|
|
||||||
|
|
||||||
|
def main(NetExport):
|
||||||
|
path = "SoVits4.0"
|
||||||
|
if NetExport:
|
||||||
|
device = torch.device("cpu")
|
||||||
|
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||||
|
SVCVITS = SynthesizerTrn(
|
||||||
|
hps.data.filter_length // 2 + 1,
|
||||||
|
hps.train.segment_size // hps.data.hop_length,
|
||||||
|
**hps.model)
|
||||||
|
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||||
|
_ = SVCVITS.eval().to(device)
|
||||||
|
for i in SVCVITS.parameters():
|
||||||
|
i.requires_grad = False
|
||||||
|
|
||||||
|
n_frame = 10
|
||||||
|
test_hidden_unit = torch.rand(1, n_frame, 256)
|
||||||
|
test_pitch = torch.rand(1, n_frame)
|
||||||
|
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
|
||||||
|
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
|
||||||
|
test_noise = torch.randn(1, 192, n_frame)
|
||||||
|
test_sid = torch.LongTensor([0])
|
||||||
|
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||||
|
output_names = ["audio", ]
|
||||||
|
|
||||||
|
torch.onnx.export(SVCVITS,
|
||||||
|
(
|
||||||
|
test_hidden_unit.to(device),
|
||||||
|
test_pitch.to(device),
|
||||||
|
test_mel2ph.to(device),
|
||||||
|
test_uv.to(device),
|
||||||
|
test_noise.to(device),
|
||||||
|
test_sid.to(device)
|
||||||
|
),
|
||||||
|
f"checkpoints/{path}/model.onnx",
|
||||||
|
dynamic_axes={
|
||||||
|
"c": [0, 1],
|
||||||
|
"f0": [1],
|
||||||
|
"mel2ph": [1],
|
||||||
|
"uv": [1],
|
||||||
|
"noise": [2],
|
||||||
|
},
|
||||||
|
do_constant_folding=False,
|
||||||
|
opset_version=16,
|
||||||
|
verbose=False,
|
||||||
|
input_names=input_names,
|
||||||
|
output_names=output_names)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main(True)
|
|
@ -1,138 +0,0 @@
|
||||||
import json
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
import utils
|
|
||||||
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
path = "crs"
|
|
||||||
|
|
||||||
device = torch.device("cpu")
|
|
||||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
|
||||||
SVCVITS = SynthesizerTrn(
|
|
||||||
hps.data.filter_length // 2 + 1,
|
|
||||||
hps.train.segment_size // hps.data.hop_length,
|
|
||||||
**hps.model)
|
|
||||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
|
||||||
_ = SVCVITS.eval().to(device)
|
|
||||||
for i in SVCVITS.parameters():
|
|
||||||
i.requires_grad = False
|
|
||||||
|
|
||||||
num_frames = 200
|
|
||||||
|
|
||||||
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
|
|
||||||
test_pitch = torch.rand(1, num_frames)
|
|
||||||
test_vol = torch.rand(1, num_frames)
|
|
||||||
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
|
|
||||||
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
|
|
||||||
test_noise = torch.randn(1, 192, num_frames)
|
|
||||||
test_sid = torch.LongTensor([0])
|
|
||||||
export_mix = True
|
|
||||||
if len(hps.spk) < 2:
|
|
||||||
export_mix = False
|
|
||||||
|
|
||||||
if export_mix:
|
|
||||||
spk_mix = []
|
|
||||||
n_spk = len(hps.spk)
|
|
||||||
for i in range(n_spk):
|
|
||||||
spk_mix.append(1.0/float(n_spk))
|
|
||||||
test_sid = torch.tensor(spk_mix)
|
|
||||||
SVCVITS.export_chara_mix(hps.spk)
|
|
||||||
test_sid = test_sid.unsqueeze(0)
|
|
||||||
test_sid = test_sid.repeat(num_frames, 1)
|
|
||||||
|
|
||||||
SVCVITS.eval()
|
|
||||||
|
|
||||||
if export_mix:
|
|
||||||
daxes = {
|
|
||||||
"c": [0, 1],
|
|
||||||
"f0": [1],
|
|
||||||
"mel2ph": [1],
|
|
||||||
"uv": [1],
|
|
||||||
"noise": [2],
|
|
||||||
"sid":[0]
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
daxes = {
|
|
||||||
"c": [0, 1],
|
|
||||||
"f0": [1],
|
|
||||||
"mel2ph": [1],
|
|
||||||
"uv": [1],
|
|
||||||
"noise": [2]
|
|
||||||
}
|
|
||||||
|
|
||||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
|
||||||
output_names = ["audio", ]
|
|
||||||
|
|
||||||
if SVCVITS.vol_embedding:
|
|
||||||
input_names.append("vol")
|
|
||||||
vol_dadict = {"vol" : [1]}
|
|
||||||
daxes.update(vol_dadict)
|
|
||||||
test_inputs = (
|
|
||||||
test_hidden_unit.to(device),
|
|
||||||
test_pitch.to(device),
|
|
||||||
test_mel2ph.to(device),
|
|
||||||
test_uv.to(device),
|
|
||||||
test_noise.to(device),
|
|
||||||
test_sid.to(device),
|
|
||||||
test_vol.to(device)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
test_inputs = (
|
|
||||||
test_hidden_unit.to(device),
|
|
||||||
test_pitch.to(device),
|
|
||||||
test_mel2ph.to(device),
|
|
||||||
test_uv.to(device),
|
|
||||||
test_noise.to(device),
|
|
||||||
test_sid.to(device)
|
|
||||||
)
|
|
||||||
|
|
||||||
# SVCVITS = torch.jit.script(SVCVITS)
|
|
||||||
SVCVITS(test_hidden_unit.to(device),
|
|
||||||
test_pitch.to(device),
|
|
||||||
test_mel2ph.to(device),
|
|
||||||
test_uv.to(device),
|
|
||||||
test_noise.to(device),
|
|
||||||
test_sid.to(device),
|
|
||||||
test_vol.to(device))
|
|
||||||
|
|
||||||
torch.onnx.export(
|
|
||||||
SVCVITS,
|
|
||||||
test_inputs,
|
|
||||||
f"checkpoints/{path}/{path}_SoVits.onnx",
|
|
||||||
dynamic_axes=daxes,
|
|
||||||
do_constant_folding=False,
|
|
||||||
opset_version=16,
|
|
||||||
verbose=False,
|
|
||||||
input_names=input_names,
|
|
||||||
output_names=output_names
|
|
||||||
)
|
|
||||||
|
|
||||||
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
|
|
||||||
spklist = []
|
|
||||||
for key in hps.spk.keys():
|
|
||||||
spklist.append(key)
|
|
||||||
|
|
||||||
MoeVSConf = {
|
|
||||||
"Folder" : f"{path}",
|
|
||||||
"Name" : f"{path}",
|
|
||||||
"Type" : "SoVits",
|
|
||||||
"Rate" : hps.data.sampling_rate,
|
|
||||||
"Hop" : hps.data.hop_length,
|
|
||||||
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
|
|
||||||
"SoVits4": True,
|
|
||||||
"SoVits3": False,
|
|
||||||
"CharaMix": export_mix,
|
|
||||||
"Volume": SVCVITS.vol_embedding,
|
|
||||||
"HiddenSize": SVCVITS.gin_channels,
|
|
||||||
"Characters": spklist
|
|
||||||
}
|
|
||||||
|
|
||||||
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
|
|
||||||
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
18
utils.py
18
utils.py
|
@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):
|
||||||
|
|
||||||
|
|
||||||
def f0_to_coarse(f0):
|
def f0_to_coarse(f0):
|
||||||
is_torch = isinstance(f0, torch.Tensor)
|
f0_mel = 1127 * (1 + f0 / 700).log()
|
||||||
f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
|
a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
|
||||||
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
|
b = f0_mel_min * a - 1.
|
||||||
|
f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
|
||||||
f0_mel[f0_mel <= 1] = 1
|
# torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
|
||||||
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
|
f0_coarse = torch.round(f0_mel).long()
|
||||||
f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
|
f0_coarse = f0_coarse * (f0_coarse > 0)
|
||||||
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
|
f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
|
||||||
|
f0_coarse = f0_coarse * (f0_coarse < f0_bin)
|
||||||
|
f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
|
||||||
return f0_coarse
|
return f0_coarse
|
||||||
|
|
||||||
def get_content(cmodel, y):
|
def get_content(cmodel, y):
|
||||||
|
|
|
@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
|
||||||
self.sampling_rate = samp_rate
|
self.sampling_rate = samp_rate
|
||||||
self.voiced_threshold = voiced_threshold
|
self.voiced_threshold = voiced_threshold
|
||||||
self.flag_for_pulse = flag_for_pulse
|
self.flag_for_pulse = flag_for_pulse
|
||||||
|
self.onnx = False
|
||||||
|
|
||||||
def _f02uv(self, f0):
|
def _f02uv(self, f0):
|
||||||
# generate uv signal
|
# generate uv signal
|
||||||
|
@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
|
||||||
sines = torch.cos(i_phase * 2 * np.pi)
|
sines = torch.cos(i_phase * 2 * np.pi)
|
||||||
return sines
|
return sines
|
||||||
|
|
||||||
def forward(self, f0):
|
def forward(self, f0, upp=None):
|
||||||
""" sine_tensor, uv = forward(f0)
|
""" sine_tensor, uv = forward(f0)
|
||||||
input F0: tensor(batchsize=1, length, dim=1)
|
input F0: tensor(batchsize=1, length, dim=1)
|
||||||
f0 for unvoiced steps should be 0
|
f0 for unvoiced steps should be 0
|
||||||
output sine_tensor: tensor(batchsize=1, length, dim)
|
output sine_tensor: tensor(batchsize=1, length, dim)
|
||||||
output uv: tensor(batchsize=1, length, 1)
|
output uv: tensor(batchsize=1, length, 1)
|
||||||
"""
|
"""
|
||||||
with torch.no_grad():
|
if self.onnx:
|
||||||
# fundamental component
|
with torch.no_grad():
|
||||||
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
f0 = f0[:, None].transpose(1, 2)
|
||||||
|
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
|
||||||
|
# fundamental component
|
||||||
|
f0_buf[:, :, 0] = f0[:, :, 0]
|
||||||
|
for idx in np.arange(self.harmonic_num):
|
||||||
|
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
|
||||||
|
idx + 2
|
||||||
|
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
|
||||||
|
rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
|
||||||
|
rand_ini = torch.rand(
|
||||||
|
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
|
||||||
|
)
|
||||||
|
rand_ini[:, 0] = 0
|
||||||
|
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
|
||||||
|
tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
|
||||||
|
tmp_over_one *= upp
|
||||||
|
tmp_over_one = F.interpolate(
|
||||||
|
tmp_over_one.transpose(2, 1),
|
||||||
|
scale_factor=upp,
|
||||||
|
mode="linear",
|
||||||
|
align_corners=True,
|
||||||
|
).transpose(2, 1)
|
||||||
|
rad_values = F.interpolate(
|
||||||
|
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
|
||||||
|
).transpose(
|
||||||
|
2, 1
|
||||||
|
) #######
|
||||||
|
tmp_over_one %= 1
|
||||||
|
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
|
||||||
|
cumsum_shift = torch.zeros_like(rad_values)
|
||||||
|
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
|
||||||
|
sine_waves = torch.sin(
|
||||||
|
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
|
||||||
|
)
|
||||||
|
sine_waves = sine_waves * self.sine_amp
|
||||||
|
uv = self._f02uv(f0)
|
||||||
|
uv = F.interpolate(
|
||||||
|
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
|
||||||
|
).transpose(2, 1)
|
||||||
|
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||||
|
noise = noise_amp * torch.randn_like(sine_waves)
|
||||||
|
sine_waves = sine_waves * uv + noise
|
||||||
|
return sine_waves, uv, noise
|
||||||
|
else:
|
||||||
|
with torch.no_grad():
|
||||||
|
# fundamental component
|
||||||
|
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
||||||
|
|
||||||
# generate sine waveforms
|
# generate sine waveforms
|
||||||
sine_waves = self._f02sine(fn) * self.sine_amp
|
sine_waves = self._f02sine(fn) * self.sine_amp
|
||||||
|
|
||||||
# generate uv signal
|
# generate uv signal
|
||||||
# uv = torch.ones(f0.shape)
|
# uv = torch.ones(f0.shape)
|
||||||
# uv = uv * (f0 > self.voiced_threshold)
|
# uv = uv * (f0 > self.voiced_threshold)
|
||||||
uv = self._f02uv(f0)
|
uv = self._f02uv(f0)
|
||||||
|
|
||||||
# noise: for unvoiced should be similar to sine_amp
|
# noise: for unvoiced should be similar to sine_amp
|
||||||
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
||||||
# . for voiced regions is self.noise_std
|
# . for voiced regions is self.noise_std
|
||||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||||
noise = noise_amp * torch.randn_like(sine_waves)
|
noise = noise_amp * torch.randn_like(sine_waves)
|
||||||
|
|
||||||
# first: set the unvoiced part to 0 by uv
|
# first: set the unvoiced part to 0 by uv
|
||||||
# then: additive noise
|
# then: additive noise
|
||||||
sine_waves = sine_waves * uv + noise
|
sine_waves = sine_waves * uv + noise
|
||||||
return sine_waves, uv, noise
|
return sine_waves, uv, noise
|
||||||
|
|
||||||
|
|
||||||
class SourceModuleHnNSF(torch.nn.Module):
|
class SourceModuleHnNSF(torch.nn.Module):
|
||||||
|
@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
|
||||||
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
|
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
|
||||||
self.l_tanh = torch.nn.Tanh()
|
self.l_tanh = torch.nn.Tanh()
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x, upp=None):
|
||||||
"""
|
"""
|
||||||
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
|
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
|
||||||
F0_sampled (batchsize, length, 1)
|
F0_sampled (batchsize, length, 1)
|
||||||
|
@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
|
||||||
noise_source (batchsize, length 1)
|
noise_source (batchsize, length 1)
|
||||||
"""
|
"""
|
||||||
# source for harmonic branch
|
# source for harmonic branch
|
||||||
sine_wavs, uv, _ = self.l_sin_gen(x)
|
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
|
||||||
sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
|
sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
|
||||||
|
|
||||||
# source for noise branch, in the same shape as uv
|
# source for noise branch, in the same shape as uv
|
||||||
|
@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
|
||||||
self.ups.apply(init_weights)
|
self.ups.apply(init_weights)
|
||||||
self.conv_post.apply(init_weights)
|
self.conv_post.apply(init_weights)
|
||||||
self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
|
self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
|
||||||
|
self.upp = np.prod(h["upsample_rates"])
|
||||||
|
self.onnx = False
|
||||||
|
|
||||||
|
def OnnxExport(self):
|
||||||
|
self.onnx = True
|
||||||
|
self.m_source.l_sin_gen.onnx = True
|
||||||
|
|
||||||
def forward(self, x, f0, g=None):
|
def forward(self, x, f0, g=None):
|
||||||
# print(1,x.shape,f0.shape,f0[:, None].shape)
|
# print(1,x.shape,f0.shape,f0[:, None].shape)
|
||||||
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
|
if not self.onnx:
|
||||||
|
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
|
||||||
# print(2,f0.shape)
|
# print(2,f0.shape)
|
||||||
har_source, noi_source, uv = self.m_source(f0)
|
har_source, noi_source, uv = self.m_source(f0, self.upp)
|
||||||
har_source = har_source.transpose(1, 2)
|
har_source = har_source.transpose(1, 2)
|
||||||
x = self.conv_pre(x)
|
x = self.conv_pre(x)
|
||||||
x = x + self.cond(g)
|
x = x + self.cond(g)
|
||||||
|
|
Loading…
Reference in New Issue