diff --git a/.gitignore b/.gitignore
index a34450c..4617d47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,3 +157,10 @@ filelists/val.txt
 .idea/vcs.xml
 .idea/inspectionProfiles/profiles_settings.xml
 .idea/inspectionProfiles/Project_Default.xml
+pretrain/vec-768-layer-12.onnx
+pretrain/hubert-soft.onnx
+pretrain/hubert4.0.onnx
+pretrain/vec-256-layer-9.onnx
+pretrain/vec-256-layer-12.onnx
+pretrain/vec-768-layer-9.onnx
+.vscode/launch.json
diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 9d0e016..b8d294d 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -219,16 +219,23 @@ class Svc(object):
              f0_predictor='pm',
              enhancer_adaptive_key = 0,
              cr_threshold = 0.05,
-             k_step = 100
+             k_step = 100,
+             frame = 0,
+             spk_mix = False
              ):
-
-        speaker_id = self.spk2id.get(speaker)
-        if not speaker_id and type(speaker) is int:
-            if len(self.spk2id.__dict__) >= speaker:
-                speaker_id = speaker
-        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
         wav, sr = librosa.load(raw_path, sr=self.target_sample)
-        c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+        if spk_mix:
+            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            n_frames = f0.size(1)
+            sid = speaker[:, frame:frame+n_frames].transpose(0,1)
+        else:
+            speaker_id = self.spk2id.get(speaker)
+            if not speaker_id and type(speaker) is int:
+                if len(self.spk2id.__dict__) >= speaker:
+                    speaker_id = speaker
+            sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            n_frames = f0.size(1)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
@@ -266,7 +273,7 @@ class Svc(object):
                                           adaptive_key = enhancer_adaptive_key)
         use_time = time.time() - start
         print("vits use time:{}".format(use_time))
-        return audio, audio.shape[-1]
+        return audio, audio.shape[-1], n_frames

     def clear_empty(self):
         # clean up vram
@@ -297,7 +304,8 @@ class Svc(object):
                         f0_predictor='pm',
                         enhancer_adaptive_key = 0,
                         cr_threshold = 0.05,
-                        k_step = 100
+                        k_step = 100,
+                        use_spk_mix = False
                         ):
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
@@ -308,7 +316,59 @@ class Svc(object):
         lg_size_c_l = (lg_size-lg_size_r)//2
         lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
         lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
+
+        if use_spk_mix:
+            assert len(self.spk2id) == len(spk)
+            self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
+            audio_length = 0
+            for (slice_tag, data) in audio_data:
+                aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
+                if slice_tag:
+                    audio_length += aud_length // self.hop_size
+                    continue
+                if per_size != 0:
+                    datas = split_list_by_n(data, per_size,lg_size)
+                else:
+                    datas = [data]
+                for k,dat in enumerate(datas):
+                    pad_len = int(audio_sr * pad_seconds)
+                    per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
+                    a_length = per_length + 2 * pad_len
+                    audio_length += a_length // self.hop_size
+            audio_length += len(audio_data)
+            spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
+            for i in range(len(spk)):
+                last_end = None
+                for mix in spk[i]:
+                    if mix[3]<0. or mix[2]<0.:
+                        raise RuntimeError("mix value must be higher than zero!")
+                    begin = int(audio_length * mix[0])
+                    end = int(audio_length * mix[1])
+                    length = end - begin
+                    if length<=0: raise RuntimeError("begin must be lower than end!")
+                    step = (mix[3] - mix[2])/length
+                    if last_end is not None:
+                        if last_end != begin:
+                            raise RuntimeError("[i]EndTime must equal [i+1]BeginTime!")
+                    last_end = end
+                    spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
+                    if(len(spk_mix_data)<length):
+                        num_pad = length - len(spk_mix_data)
+                        spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
+                    spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
[...]
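Note on the infer() changes above: with spk_mix=True, the speaker argument is no longer a name or ID but an [S, T] tensor of per-frame speaker weights, and frame is the caller's running offset into it; the newly returned n_frames lets the caller advance that offset chunk by chunk. A minimal sketch of the calling pattern, assuming a weight tensor already built from spkmix.py (svc, weights, and chunk_paths are illustrative names, not part of this patch, and the keyword-argument order is hedged rather than taken from the full signature):

    # Hypothetical driver loop: feed each chunk its slice of the per-frame
    # speaker-weight tensor, advancing the offset by the frames consumed.
    frame = 0
    rendered = []
    for path in chunk_paths:
        audio, length, n_frames = svc.infer(
            weights,          # [S, T] per-frame weights instead of a speaker ID
            tran=0,           # no transposition, for simplicity
            raw_path=path,
            spk_mix=True,
            frame=frame,      # offset into the weight tensor for this chunk
        )
        frame += n_frames     # keep the weights aligned with the rendered audio
        rendered.append(audio)

This bookkeeping is why infer now returns n_frames alongside the audio.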
diff --git a/models.py b/models.py
[...]
+        if self.character_mix and len(g) > 1:   # [N, S]  *  [S, B, 1, H]
+            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
+            g = g * self.speaker_map  # [N, S, B, 1, H]
+            g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
+            g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
+        else:
+            if g.dim() == 1:
+                g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
+
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
         x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
diff --git a/spkmix.py b/spkmix.py
new file mode 100644
index 0000000..1d266e0
--- /dev/null
+++ b/spkmix.py
@@ -0,0 +1,11 @@
+# Speaker mix track rules:
+# speaker ID : [[start time 1, end time 1, start value 1, end value 1], [start time 2, end time 2, start value 2, end value 2]]
+# Each start time must equal the previous segment's end time; the first start time must be 0 and the last end time must be 1 (times range over 0-1).
+# Every speaker must have an entry; give unused speakers [[0., 1., 0., 0.]].
+# Mix values may be arbitrary: within each segment the value moves linearly from the start value to the end value, and the per-frame linear combination is normalized internally to sum to 1, so any values are safe.
+
+spk_mix_map = {
+    0 : [[0., 0.5, 1, 0.5], [0.5, 1, 0.5, 1]],
+    1 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]],
+    2 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]]
+}
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 0e3c170..394c5cb 100644
--- a/utils.py
+++ b/utils.py
@@ -9,7 +9,6 @@ import subprocess
 import warnings
 import random
 import functools
-
 import librosa
 import numpy as np
 from scipy.io.wavfile import read
@@ -110,19 +109,19 @@ def get_speech_encoder(speech_encoder,device=None,**kargs):
         speech_encoder_object = ContentVec256L9(device = device)
     elif speech_encoder == "vec256l9-onnx":
         from vencoder.ContentVec256L9_Onnx import ContentVec256L9_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec256L9_Onnx(device = device)
     elif speech_encoder == "vec256l12-onnx":
         from vencoder.ContentVec256L12_Onnx import ContentVec256L12_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec256L12_Onnx(device = device)
     elif speech_encoder == "vec768l9-onnx":
         from vencoder.ContentVec768L9_Onnx import ContentVec768L9_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec768L9_Onnx(device = device)
     elif speech_encoder == "vec768l12-onnx":
         from vencoder.ContentVec768L12_Onnx import ContentVec768L12_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec768L12_Onnx(device = device)
     elif speech_encoder == "hubertsoft-onnx":
         from vencoder.HubertSoft_Onnx import HubertSoft_Onnx
-        speech_encoder_object = HubertSoft(device = device)
+        speech_encoder_object = HubertSoft_Onnx(device = device)
     elif speech_encoder == "hubertsoft":
         from vencoder.HubertSoft import HubertSoft
         speech_encoder_object = HubertSoft(device = device)
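Note on the spkmix.py format: each track is rasterized into a per-frame weight curve, and the per-frame weights across speakers are normalized to sum to 1 before models.py contracts them against the [S, B, 1, H] speaker-embedding map. A minimal standalone sketch of that expansion and normalization, assuming a hypothetical frame count (the patch derives audio_length from the sliced audio and builds its ramps with torch.arange plus reflect-padding; torch.linspace gives the same linear ramp):

    import torch

    audio_length = 1000  # hypothetical frame count, for illustration only

    # Two-speaker map in the spkmix.py format: [start_t, end_t, start_val, end_val]
    spk_mix_map = {
        0: [[0., 0.5, 1., 0.5], [0.5, 1., 0.5, 1.]],
        1: [[0., 1., 0.5, 0.5]],
    }

    weights = torch.zeros(len(spk_mix_map), audio_length)
    for spk_id, track in spk_mix_map.items():
        for start_t, end_t, start_v, end_v in track:
            begin, end = int(audio_length * start_t), int(audio_length * end_t)
            # Linear ramp from start_v to end_v over the segment's frames
            weights[spk_id, begin:end] = torch.linspace(start_v, end_v, end - begin)

    # Normalize each frame so the speaker weights form a convex combination,
    # matching the "normalized to sum to 1" guarantee in spkmix.py's comments.
    weights = weights / weights.sum(dim=0, keepdim=True)

The resulting [S, T] tensor is what infer slices per chunk and what models.py reduces to a single gin-channel embedding per frame.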