修复两个BUG：NsfHifigan在DML中上采样出错，以及SourceModuleHnNSF的错误
This commit is contained in:
parent 72deb15df3
commit 90c9ccc6a8
180
onnx_export.py
180
onnx_export.py
|
@ -1,56 +1,138 @@
|
|||
import json
|
||||
import torch
|
||||
|
||||
import utils
|
||||
from onnxexport.model_onnx import SynthesizerTrn
|
||||
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
||||
|
||||
|
||||
def main(NetExport):
|
||||
path = "SoVits4.0"
|
||||
if NetExport:
|
||||
device = torch.device("cpu")
|
||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||
SVCVITS = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
**hps.model)
|
||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||
_ = SVCVITS.eval().to(device)
|
||||
for i in SVCVITS.parameters():
|
||||
i.requires_grad = False
|
||||
|
||||
n_frame = 10
|
||||
test_hidden_unit = torch.rand(1, n_frame, 256)
|
||||
test_pitch = torch.rand(1, n_frame)
|
||||
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
|
||||
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
|
||||
test_noise = torch.randn(1, 192, n_frame)
|
||||
test_sid = torch.LongTensor([0])
|
||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||
output_names = ["audio", ]
|
||||
|
||||
torch.onnx.export(SVCVITS,
|
||||
(
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device)
|
||||
),
|
||||
f"checkpoints/{path}/model.onnx",
|
||||
dynamic_axes={
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2],
|
||||
},
|
||||
do_constant_folding=False,
|
||||
opset_version=16,
|
||||
verbose=False,
|
||||
input_names=input_names,
|
||||
output_names=output_names)
|
||||
def main():
|
||||
path = "crs"
|
||||
|
||||
device = torch.device("cpu")
|
||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||
SVCVITS = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
**hps.model)
|
||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||
_ = SVCVITS.eval().to(device)
|
||||
for i in SVCVITS.parameters():
|
||||
i.requires_grad = False
|
||||
|
||||
num_frames = 200
|
||||
|
||||
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
|
||||
test_pitch = torch.rand(1, num_frames)
|
||||
test_vol = torch.rand(1, num_frames)
|
||||
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
|
||||
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
|
||||
test_noise = torch.randn(1, 192, num_frames)
|
||||
test_sid = torch.LongTensor([0])
|
||||
export_mix = True
|
||||
if len(hps.spk) < 2:
|
||||
export_mix = False
|
||||
|
||||
if export_mix:
|
||||
spk_mix = []
|
||||
n_spk = len(hps.spk)
|
||||
for i in range(n_spk):
|
||||
spk_mix.append(1.0/float(n_spk))
|
||||
test_sid = torch.tensor(spk_mix)
|
||||
SVCVITS.export_chara_mix(hps.spk)
|
||||
test_sid = test_sid.unsqueeze(0)
|
||||
test_sid = test_sid.repeat(num_frames, 1)
|
||||
|
||||
SVCVITS.eval()
|
||||
|
||||
if export_mix:
|
||||
daxes = {
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2],
|
||||
"sid":[0]
|
||||
}
|
||||
else:
|
||||
daxes = {
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2]
|
||||
}
|
||||
|
||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||
output_names = ["audio", ]
|
||||
|
||||
if SVCVITS.vol_embedding:
|
||||
input_names.append("vol")
|
||||
vol_dadict = {"vol" : [1]}
|
||||
daxes.update(vol_dadict)
|
||||
test_inputs = (
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device),
|
||||
test_vol.to(device)
|
||||
)
|
||||
else:
|
||||
test_inputs = (
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device)
|
||||
)
|
||||
|
||||
# SVCVITS = torch.jit.script(SVCVITS)
|
||||
SVCVITS(test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device),
|
||||
test_vol.to(device))
|
||||
|
||||
SVCVITS.dec.OnnxExport()
|
||||
|
||||
torch.onnx.export(
|
||||
SVCVITS,
|
||||
test_inputs,
|
||||
f"checkpoints/{path}/{path}_SoVits.onnx",
|
||||
dynamic_axes=daxes,
|
||||
do_constant_folding=False,
|
||||
opset_version=16,
|
||||
verbose=False,
|
||||
input_names=input_names,
|
||||
output_names=output_names
|
||||
)
|
||||
|
||||
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
|
||||
spklist = []
|
||||
for key in hps.spk.keys():
|
||||
spklist.append(key)
|
||||
|
||||
MoeVSConf = {
|
||||
"Folder" : f"{path}",
|
||||
"Name" : f"{path}",
|
||||
"Type" : "SoVits",
|
||||
"Rate" : hps.data.sampling_rate,
|
||||
"Hop" : hps.data.hop_length,
|
||||
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
|
||||
"SoVits4": True,
|
||||
"SoVits3": False,
|
||||
"CharaMix": export_mix,
|
||||
"Volume": SVCVITS.vol_embedding,
|
||||
"HiddenSize": SVCVITS.gin_channels,
|
||||
"Characters": spklist
|
||||
}
|
||||
|
||||
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
|
||||
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(True)
|
||||
main()
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
import torch
|
||||
|
||||
import utils
|
||||
from onnxexport.model_onnx import SynthesizerTrn
|
||||
|
||||
|
||||
def main(NetExport):
|
||||
path = "SoVits4.0"
|
||||
if NetExport:
|
||||
device = torch.device("cpu")
|
||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||
SVCVITS = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
**hps.model)
|
||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||
_ = SVCVITS.eval().to(device)
|
||||
for i in SVCVITS.parameters():
|
||||
i.requires_grad = False
|
||||
|
||||
n_frame = 10
|
||||
test_hidden_unit = torch.rand(1, n_frame, 256)
|
||||
test_pitch = torch.rand(1, n_frame)
|
||||
test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
|
||||
test_uv = torch.ones(1, n_frame, dtype=torch.float32)
|
||||
test_noise = torch.randn(1, 192, n_frame)
|
||||
test_sid = torch.LongTensor([0])
|
||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||
output_names = ["audio", ]
|
||||
|
||||
torch.onnx.export(SVCVITS,
|
||||
(
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device)
|
||||
),
|
||||
f"checkpoints/{path}/model.onnx",
|
||||
dynamic_axes={
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2],
|
||||
},
|
||||
do_constant_folding=False,
|
||||
opset_version=16,
|
||||
verbose=False,
|
||||
input_names=input_names,
|
||||
output_names=output_names)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(True)
|
|
@ -1,138 +0,0 @@
|
|||
import json
|
||||
|
||||
import torch
|
||||
|
||||
import utils
|
||||
from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
|
||||
|
||||
|
||||
def main():
|
||||
path = "crs"
|
||||
|
||||
device = torch.device("cpu")
|
||||
hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
|
||||
SVCVITS = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
**hps.model)
|
||||
_ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
|
||||
_ = SVCVITS.eval().to(device)
|
||||
for i in SVCVITS.parameters():
|
||||
i.requires_grad = False
|
||||
|
||||
num_frames = 200
|
||||
|
||||
test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)
|
||||
test_pitch = torch.rand(1, num_frames)
|
||||
test_vol = torch.rand(1, num_frames)
|
||||
test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)
|
||||
test_uv = torch.ones(1, num_frames, dtype=torch.float32)
|
||||
test_noise = torch.randn(1, 192, num_frames)
|
||||
test_sid = torch.LongTensor([0])
|
||||
export_mix = True
|
||||
if len(hps.spk) < 2:
|
||||
export_mix = False
|
||||
|
||||
if export_mix:
|
||||
spk_mix = []
|
||||
n_spk = len(hps.spk)
|
||||
for i in range(n_spk):
|
||||
spk_mix.append(1.0/float(n_spk))
|
||||
test_sid = torch.tensor(spk_mix)
|
||||
SVCVITS.export_chara_mix(hps.spk)
|
||||
test_sid = test_sid.unsqueeze(0)
|
||||
test_sid = test_sid.repeat(num_frames, 1)
|
||||
|
||||
SVCVITS.eval()
|
||||
|
||||
if export_mix:
|
||||
daxes = {
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2],
|
||||
"sid":[0]
|
||||
}
|
||||
else:
|
||||
daxes = {
|
||||
"c": [0, 1],
|
||||
"f0": [1],
|
||||
"mel2ph": [1],
|
||||
"uv": [1],
|
||||
"noise": [2]
|
||||
}
|
||||
|
||||
input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
|
||||
output_names = ["audio", ]
|
||||
|
||||
if SVCVITS.vol_embedding:
|
||||
input_names.append("vol")
|
||||
vol_dadict = {"vol" : [1]}
|
||||
daxes.update(vol_dadict)
|
||||
test_inputs = (
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device),
|
||||
test_vol.to(device)
|
||||
)
|
||||
else:
|
||||
test_inputs = (
|
||||
test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device)
|
||||
)
|
||||
|
||||
# SVCVITS = torch.jit.script(SVCVITS)
|
||||
SVCVITS(test_hidden_unit.to(device),
|
||||
test_pitch.to(device),
|
||||
test_mel2ph.to(device),
|
||||
test_uv.to(device),
|
||||
test_noise.to(device),
|
||||
test_sid.to(device),
|
||||
test_vol.to(device))
|
||||
|
||||
torch.onnx.export(
|
||||
SVCVITS,
|
||||
test_inputs,
|
||||
f"checkpoints/{path}/{path}_SoVits.onnx",
|
||||
dynamic_axes=daxes,
|
||||
do_constant_folding=False,
|
||||
opset_version=16,
|
||||
verbose=False,
|
||||
input_names=input_names,
|
||||
output_names=output_names
|
||||
)
|
||||
|
||||
vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"
|
||||
spklist = []
|
||||
for key in hps.spk.keys():
|
||||
spklist.append(key)
|
||||
|
||||
MoeVSConf = {
|
||||
"Folder" : f"{path}",
|
||||
"Name" : f"{path}",
|
||||
"Type" : "SoVits",
|
||||
"Rate" : hps.data.sampling_rate,
|
||||
"Hop" : hps.data.hop_length,
|
||||
"Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
|
||||
"SoVits4": True,
|
||||
"SoVits3": False,
|
||||
"CharaMix": export_mix,
|
||||
"Volume": SVCVITS.vol_embedding,
|
||||
"HiddenSize": SVCVITS.gin_channels,
|
||||
"Characters": spklist
|
||||
}
|
||||
|
||||
with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:
|
||||
json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
18
utils.py
18
utils.py
|
@ -68,14 +68,16 @@ def plot_data_to_numpy(x, y):
|
|||
|
||||
|
||||
def f0_to_coarse(f0):
|
||||
is_torch = isinstance(f0, torch.Tensor)
|
||||
f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
|
||||
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
|
||||
|
||||
f0_mel[f0_mel <= 1] = 1
|
||||
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
|
||||
f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int)
|
||||
assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
|
||||
f0_mel = 1127 * (1 + f0 / 700).log()
|
||||
a = (f0_bin - 2) / (f0_mel_max - f0_mel_min)
|
||||
b = f0_mel_min * a - 1.
|
||||
f0_mel = torch.where(f0_mel > 0, f0_mel * a - b, f0_mel)
|
||||
# torch.clip_(f0_mel, min=1., max=float(f0_bin - 1))
|
||||
f0_coarse = torch.round(f0_mel).long()
|
||||
f0_coarse = f0_coarse * (f0_coarse > 0)
|
||||
f0_coarse = f0_coarse + ((f0_coarse < 1) * 1)
|
||||
f0_coarse = f0_coarse * (f0_coarse < f0_bin)
|
||||
f0_coarse = f0_coarse + ((f0_coarse >= f0_bin) * (f0_bin - 1))
|
||||
return f0_coarse
|
||||
|
||||
def get_content(cmodel, y):
|
||||
|
|
|
@ -128,6 +128,7 @@ class SineGen(torch.nn.Module):
|
|||
self.sampling_rate = samp_rate
|
||||
self.voiced_threshold = voiced_threshold
|
||||
self.flag_for_pulse = flag_for_pulse
|
||||
self.onnx = False
|
||||
|
||||
def _f02uv(self, f0):
|
||||
# generate uv signal
|
||||
|
@ -193,35 +194,81 @@ class SineGen(torch.nn.Module):
|
|||
sines = torch.cos(i_phase * 2 * np.pi)
|
||||
return sines
|
||||
|
||||
def forward(self, f0):
|
||||
def forward(self, f0, upp=None):
|
||||
""" sine_tensor, uv = forward(f0)
|
||||
input F0: tensor(batchsize=1, length, dim=1)
|
||||
f0 for unvoiced steps should be 0
|
||||
output sine_tensor: tensor(batchsize=1, length, dim)
|
||||
output uv: tensor(batchsize=1, length, 1)
|
||||
"""
|
||||
with torch.no_grad():
|
||||
# fundamental component
|
||||
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
||||
if self.onnx:
|
||||
with torch.no_grad():
|
||||
f0 = f0[:, None].transpose(1, 2)
|
||||
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
|
||||
# fundamental component
|
||||
f0_buf[:, :, 0] = f0[:, :, 0]
|
||||
for idx in np.arange(self.harmonic_num):
|
||||
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
|
||||
idx + 2
|
||||
) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
|
||||
rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化
|
||||
rand_ini = torch.rand(
|
||||
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
|
||||
)
|
||||
rand_ini[:, 0] = 0
|
||||
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
|
||||
tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化
|
||||
tmp_over_one *= upp
|
||||
tmp_over_one = F.interpolate(
|
||||
tmp_over_one.transpose(2, 1),
|
||||
scale_factor=upp,
|
||||
mode="linear",
|
||||
align_corners=True,
|
||||
).transpose(2, 1)
|
||||
rad_values = F.interpolate(
|
||||
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
|
||||
).transpose(
|
||||
2, 1
|
||||
) #######
|
||||
tmp_over_one %= 1
|
||||
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
|
||||
cumsum_shift = torch.zeros_like(rad_values)
|
||||
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
|
||||
sine_waves = torch.sin(
|
||||
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
|
||||
)
|
||||
sine_waves = sine_waves * self.sine_amp
|
||||
uv = self._f02uv(f0)
|
||||
uv = F.interpolate(
|
||||
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
|
||||
).transpose(2, 1)
|
||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||
noise = noise_amp * torch.randn_like(sine_waves)
|
||||
sine_waves = sine_waves * uv + noise
|
||||
return sine_waves, uv, noise
|
||||
else:
|
||||
with torch.no_grad():
|
||||
# fundamental component
|
||||
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
|
||||
|
||||
# generate sine waveforms
|
||||
sine_waves = self._f02sine(fn) * self.sine_amp
|
||||
# generate sine waveforms
|
||||
sine_waves = self._f02sine(fn) * self.sine_amp
|
||||
|
||||
# generate uv signal
|
||||
# uv = torch.ones(f0.shape)
|
||||
# uv = uv * (f0 > self.voiced_threshold)
|
||||
uv = self._f02uv(f0)
|
||||
# generate uv signal
|
||||
# uv = torch.ones(f0.shape)
|
||||
# uv = uv * (f0 > self.voiced_threshold)
|
||||
uv = self._f02uv(f0)
|
||||
|
||||
# noise: for unvoiced should be similar to sine_amp
|
||||
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
||||
# . for voiced regions is self.noise_std
|
||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||
noise = noise_amp * torch.randn_like(sine_waves)
|
||||
# noise: for unvoiced should be similar to sine_amp
|
||||
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
||||
# . for voiced regions is self.noise_std
|
||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||
noise = noise_amp * torch.randn_like(sine_waves)
|
||||
|
||||
# first: set the unvoiced part to 0 by uv
|
||||
# then: additive noise
|
||||
sine_waves = sine_waves * uv + noise
|
||||
return sine_waves, uv, noise
|
||||
# first: set the unvoiced part to 0 by uv
|
||||
# then: additive noise
|
||||
sine_waves = sine_waves * uv + noise
|
||||
return sine_waves, uv, noise
|
||||
|
||||
|
||||
class SourceModuleHnNSF(torch.nn.Module):
|
||||
|
@ -257,7 +304,7 @@ class SourceModuleHnNSF(torch.nn.Module):
|
|||
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
|
||||
self.l_tanh = torch.nn.Tanh()
|
||||
|
||||
def forward(self, x):
|
||||
def forward(self, x, upp=None):
|
||||
"""
|
||||
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
|
||||
F0_sampled (batchsize, length, 1)
|
||||
|
@ -265,7 +312,7 @@ class SourceModuleHnNSF(torch.nn.Module):
|
|||
noise_source (batchsize, length 1)
|
||||
"""
|
||||
# source for harmonic branch
|
||||
sine_wavs, uv, _ = self.l_sin_gen(x)
|
||||
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
|
||||
sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
|
||||
|
||||
# source for noise branch, in the same shape as uv
|
||||
|
@ -309,12 +356,19 @@ class Generator(torch.nn.Module):
|
|||
self.ups.apply(init_weights)
|
||||
self.conv_post.apply(init_weights)
|
||||
self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
|
||||
self.upp = np.prod(h["upsample_rates"])
|
||||
self.onnx = False
|
||||
|
||||
def OnnxExport(self):
|
||||
self.onnx = True
|
||||
self.m_source.l_sin_gen.onnx = True
|
||||
|
||||
def forward(self, x, f0, g=None):
|
||||
# print(1,x.shape,f0.shape,f0[:, None].shape)
|
||||
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
|
||||
if not self.onnx:
|
||||
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
|
||||
# print(2,f0.shape)
|
||||
har_source, noi_source, uv = self.m_source(f0)
|
||||
har_source, noi_source, uv = self.m_source(f0, self.upp)
|
||||
har_source = har_source.transpose(1, 2)
|
||||
x = self.conv_pre(x)
|
||||
x = x + self.cond(g)
|
||||
|
|
Loading…
Reference in New Issue