Support diffusion spk_mix (unstable)
This commit is contained in:
parent
867f0c5a43
commit
6491e7b1ef
|
@ -73,6 +73,45 @@ class Unit2Mel(nn.Module):
|
|||
|
||||
# diffusion
|
||||
self.decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden), out_dims=out_dims)
|
||||
self.input_channel = input_channel
|
||||
|
||||
def init_spkembed(self, units, f0, volume, spk_id=None, spk_mix_dict=None, aug_shift=None,
                  gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
    '''
    Warm-up pass for speaker mixing: builds the conditioning tensor and, as a
    side effect, fills ``self.speaker_map`` with one embedding per speaker id
    found in ``spk_mix_dict``.

    input:
        units:  B x n_frames x n_unit
        f0:     B x n_frames x 1 (Hz; must be > -700 so the log is defined)
        volume: B x n_frames x 1
        spk_mix_dict: {speaker_id: weight} — NOTE(review): ids are assumed to
            be valid integer indices into ``self.speaker_map``; confirm with
            callers.
    return:
        conditioning tensor of shape B x n_hidden x n_frames
    '''
    # Fuse the three frame-level conditions into one hidden representation.
    # (1 + f0/700).log() is a mel-scale-style warp of the raw pitch.
    x = self.unit_embed(units) + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
    if self.n_spk is not None and self.n_spk > 1:
        if spk_mix_dict is not None:
            # FIX: allocate the accumulator on the same device as the inputs.
            # The original CPU-allocated zeros crashed with a device-mismatch
            # error whenever ``units`` lived on CUDA.
            spk_embed_mix = torch.zeros((1, 1, self.hidden_size), device=units.device)
            for k, v in spk_mix_dict.items():
                spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
                spk_embed_k = self.spk_embed(spk_id_torch)
                # Cache the raw per-speaker embedding so forward() can re-mix
                # speakers frame-by-frame later.
                self.speaker_map[k] = spk_embed_k
                spk_embed_mix = spk_embed_mix + v * spk_embed_k
            x = x + spk_embed_mix
        else:
            # Single-speaker path: speaker ids are 1-based, table is 0-based.
            x = x + self.spk_embed(spk_id - 1)
    # [N, 1, 1, H] -> [1, N, 1, 1, H]; detach so the cached map is a frozen
    # buffer that does not keep the warm-up graph alive.
    self.speaker_map = self.speaker_map.unsqueeze(0)
    self.speaker_map = self.speaker_map.detach()
    return x.transpose(1, 2)
|
||||
|
||||
def init_spkmix(self, n_spk):
    '''
    Pre-build ``self.speaker_map`` for speaker mixing.

    Allocates the per-speaker embedding cache, then runs one dummy pass of
    ``init_spkembed`` with an equal-weight mix over all ``n_spk`` speakers so
    the cache is populated with every speaker's embedding.

    input:
        n_spk: number of speakers in the model.
    '''
    # FIX: the original referenced an undefined name ``n_hidden`` here
    # (NameError at call time); the hidden width is stored on the instance
    # as ``hidden_size``, matching its use in init_spkembed.
    self.speaker_map = torch.zeros((n_spk, 1, 1, self.hidden_size))
    hubert_hidden_size = self.input_channel
    n_frames = 10
    # Dummy inputs — only their shapes matter; the output is discarded.
    hubert = torch.randn((1, n_frames, hubert_hidden_size))
    f0 = torch.randn((1, n_frames))
    volume = torch.randn((1, n_frames))
    # Equal weight for every speaker id (kept on self.n_spk as before).
    spks = {i: 1.0 / float(self.n_spk) for i in range(n_spk)}
    self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
|
||||
|
||||
def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
|
||||
gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
|
||||
|
@ -91,7 +130,14 @@ class Unit2Mel(nn.Module):
|
|||
spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
|
||||
x = x + v * self.spk_embed(spk_id_torch)
|
||||
else:
|
||||
x = x + self.spk_embed(spk_id)
|
||||
if len(spk_id) > 1:
|
||||
g = spk_id.reshape((spk_id.shape[0], spk_id.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
|
||||
g = g * self.speaker_map # [N, S, B, 1, H]
|
||||
g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
|
||||
g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
|
||||
x = x + g
|
||||
else:
|
||||
x = x + self.spk_embed(spk_id)
|
||||
if self.aug_shift_embed is not None and aug_shift is not None:
|
||||
x = x + self.aug_shift_embed(aug_shift / 5)
|
||||
x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
|
||||
|
|
|
@ -150,6 +150,7 @@ class Svc(object):
|
|||
self.hop_size = self.diffusion_args.data.block_size
|
||||
self.spk2id = self.diffusion_args.spk
|
||||
self.speech_encoder = self.diffusion_args.data.encoder
|
||||
self.diffusion_model.init_spkmix(len(self.spk2id))
|
||||
else:
|
||||
print("No diffusion model or config found. Shallow diffusion mode will False")
|
||||
self.shallow_diffusion = self.only_diffusion = False
|
||||
|
@ -181,8 +182,7 @@ class Svc(object):
|
|||
_ = self.net_g_ms.half().eval().to(self.dev)
|
||||
else:
|
||||
_ = self.net_g_ms.eval().to(self.dev)
|
||||
|
||||
|
||||
self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
|
||||
|
||||
def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
|
||||
|
||||
|
@ -230,6 +230,8 @@ class Svc(object):
|
|||
sid = speaker[:, frame:frame+n_frames].transpose(0,1)
|
||||
else:
|
||||
speaker_id = self.spk2id.get(speaker)
|
||||
if speaker_id is None:
|
||||
raise RuntimeError("The name you entered is not in the speaker list!")
|
||||
if not speaker_id and type(speaker) is int:
|
||||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
|
@ -307,6 +309,10 @@ class Svc(object):
|
|||
k_step = 100,
|
||||
use_spk_mix = False
|
||||
):
|
||||
if use_spk_mix:
|
||||
if len(self.spk2id) == 1:
|
||||
spk = self.spk2id.keys()[0]
|
||||
use_spk_mix = False
|
||||
wav_path = Path(raw_audio_path).with_suffix('.wav')
|
||||
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
||||
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
||||
|
@ -319,7 +325,6 @@ class Svc(object):
|
|||
|
||||
if use_spk_mix:
|
||||
assert len(self.spk2id) == len(spk)
|
||||
self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
|
||||
audio_length = 0
|
||||
for (slice_tag, data) in audio_data:
|
||||
aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
|
||||
|
|
|
@ -27,7 +27,7 @@ def main():
|
|||
parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片,默认0为自动切片,单位为秒/s')
|
||||
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"], help='wav文件名列表,放在raw文件夹下')
|
||||
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
|
||||
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称')
|
||||
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='合成目标说话人名称')
|
||||
|
||||
# 可选项部分
|
||||
parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='语音转换自动预测音高,转换歌声时不要打开这个会严重跑调')
|
||||
|
@ -37,7 +37,7 @@ def main():
|
|||
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意:crepe为原F0使用均值滤波器)')
|
||||
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
|
||||
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散,使用后可解决一部分电音问题,默认关闭,该选项打开时,NSF_HIFIGAN增强器将会被禁止')
|
||||
parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=True, help='是否使用角色融合')
|
||||
parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='是否使用角色融合')
|
||||
|
||||
# 浅扩散设置
|
||||
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径')
|
||||
|
@ -82,7 +82,8 @@ def main():
|
|||
use_spk_mix = args.use_spk_mix
|
||||
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
|
||||
infer_tool.mkdir(["raw", "results"])
|
||||
|
||||
if len(spk_mix_map)<=1:
|
||||
use_spk_mix = False
|
||||
if use_spk_mix:
|
||||
spk_list = [spk_mix_map]
|
||||
|
||||
|
|
Loading…
Reference in New Issue