Support CharacterMix TimeLine
This commit is contained in:
parent
04e4495dce
commit
867f0c5a43
|
@ -157,3 +157,10 @@ filelists/val.txt
|
|||
.idea/vcs.xml
|
||||
.idea/inspectionProfiles/profiles_settings.xml
|
||||
.idea/inspectionProfiles/Project_Default.xml
|
||||
pretrain/vec-768-layer-12.onnx
|
||||
pretrain/hubert-soft.onnx
|
||||
pretrain/hubert4.0.onnx
|
||||
pretrain/vec-256-layer-9.onnx
|
||||
pretrain/vec-256-layer-12.onnx
|
||||
pretrain/vec-768-layer-9.onnx
|
||||
.vscode/launch.json
|
||||
|
|
|
@ -219,16 +219,23 @@ class Svc(object):
|
|||
f0_predictor='pm',
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05,
|
||||
k_step = 100
|
||||
k_step = 100,
|
||||
frame = 0,
|
||||
spk_mix = False
|
||||
):
|
||||
|
||||
speaker_id = self.spk2id.get(speaker)
|
||||
if not speaker_id and type(speaker) is int:
|
||||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
|
||||
wav, sr = librosa.load(raw_path, sr=self.target_sample)
|
||||
c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
|
||||
if spk_mix:
|
||||
c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
|
||||
n_frames = f0.size(1)
|
||||
sid = speaker[:, frame:frame+n_frames].transpose(0,1)
|
||||
else:
|
||||
speaker_id = self.spk2id.get(speaker)
|
||||
if not speaker_id and type(speaker) is int:
|
||||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
|
||||
c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
|
||||
n_frames = f0.size(1)
|
||||
if "half" in self.net_g_path and torch.cuda.is_available():
|
||||
c = c.half()
|
||||
with torch.no_grad():
|
||||
|
@ -266,7 +273,7 @@ class Svc(object):
|
|||
adaptive_key = enhancer_adaptive_key)
|
||||
use_time = time.time() - start
|
||||
print("vits use time:{}".format(use_time))
|
||||
return audio, audio.shape[-1]
|
||||
return audio, audio.shape[-1], n_frames
|
||||
|
||||
def clear_empty(self):
|
||||
# clean up vram
|
||||
|
@ -297,7 +304,8 @@ class Svc(object):
|
|||
f0_predictor='pm',
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05,
|
||||
k_step = 100
|
||||
k_step = 100,
|
||||
use_spk_mix = False
|
||||
):
|
||||
wav_path = Path(raw_audio_path).with_suffix('.wav')
|
||||
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
||||
|
@ -308,7 +316,59 @@ class Svc(object):
|
|||
lg_size_c_l = (lg_size-lg_size_r)//2
|
||||
lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
|
||||
lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
|
||||
|
||||
|
||||
if use_spk_mix:
|
||||
assert len(self.spk2id) == len(spk)
|
||||
self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
|
||||
audio_length = 0
|
||||
for (slice_tag, data) in audio_data:
|
||||
aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
|
||||
if slice_tag:
|
||||
audio_length += aud_length // self.hop_size
|
||||
continue
|
||||
if per_size != 0:
|
||||
datas = split_list_by_n(data, per_size,lg_size)
|
||||
else:
|
||||
datas = [data]
|
||||
for k,dat in enumerate(datas):
|
||||
pad_len = int(audio_sr * pad_seconds)
|
||||
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
|
||||
a_length = per_length + 2 * pad_len
|
||||
audio_length += a_length // self.hop_size
|
||||
audio_length += len(audio_data)
|
||||
spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
|
||||
for i in range(len(spk)):
|
||||
last_end = None
|
||||
for mix in spk[i]:
|
||||
if mix[3]<0. or mix[2]<0.:
|
||||
raise RuntimeError("mix value must higer Than zero!")
|
||||
begin = int(audio_length * mix[0])
|
||||
end = int(audio_length * mix[1])
|
||||
length = end - begin
|
||||
if length<=0: raise RuntimeError("begin Must lower Than end!")
|
||||
step = (mix[3] - mix[2])/length
|
||||
if last_end is not None:
|
||||
if last_end != begin:
|
||||
raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
|
||||
last_end = end
|
||||
spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
|
||||
if(len(spk_mix_data)<length):
|
||||
num_pad = length - len(spk_mix_data)
|
||||
spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
|
||||
spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
|
||||
|
||||
spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
|
||||
# spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
|
||||
for i, x in enumerate(spk_mix_ten[0]):
|
||||
if x == 0.0:
|
||||
spk_mix_ten[0][i] = 1.0
|
||||
spk_mix_tensor[:,i] = 1.0 / len(spk)
|
||||
spk_mix_tensor = spk_mix_tensor / spk_mix_ten
|
||||
if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
|
||||
raise RuntimeError("sum(spk_mix_tensor) not equal 1")
|
||||
spk = spk_mix_tensor
|
||||
|
||||
global_frame = 0
|
||||
audio = []
|
||||
for (slice_tag, data) in audio_data:
|
||||
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
|
||||
|
@ -318,6 +378,7 @@ class Svc(object):
|
|||
print('jump empty segment')
|
||||
_audio = np.zeros(length)
|
||||
audio.extend(list(pad_array(_audio, length)))
|
||||
global_frame += length // self.hop_size
|
||||
continue
|
||||
if per_size != 0:
|
||||
datas = split_list_by_n(data, per_size,lg_size)
|
||||
|
@ -332,15 +393,18 @@ class Svc(object):
|
|||
raw_path = io.BytesIO()
|
||||
soundfile.write(raw_path, dat, audio_sr, format="wav")
|
||||
raw_path.seek(0)
|
||||
out_audio, out_sr = self.infer(spk, tran, raw_path,
|
||||
out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
|
||||
cluster_infer_ratio=cluster_infer_ratio,
|
||||
auto_predict_f0=auto_predict_f0,
|
||||
noice_scale=noice_scale,
|
||||
f0_predictor = f0_predictor,
|
||||
enhancer_adaptive_key = enhancer_adaptive_key,
|
||||
cr_threshold = cr_threshold,
|
||||
k_step = k_step
|
||||
k_step = k_step,
|
||||
frame = global_frame,
|
||||
spk_mix = use_spk_mix
|
||||
)
|
||||
global_frame += out_frame
|
||||
_audio = out_audio.cpu().numpy()
|
||||
pad_len = int(self.target_sample * pad_seconds)
|
||||
_audio = _audio[pad_len:-pad_len]
|
||||
|
|
|
@ -2,12 +2,11 @@ import io
|
|||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from spkmix import spk_mix_map
|
||||
import librosa
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import soundfile
|
||||
|
||||
from inference import infer_tool
|
||||
from inference import slicer
|
||||
from inference.infer_tool import Svc
|
||||
|
@ -23,10 +22,10 @@ def main():
|
|||
parser = argparse.ArgumentParser(description='sovits4 inference')
|
||||
|
||||
# 一定要设置的部分
|
||||
parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径')
|
||||
parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径')
|
||||
parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='模型路径')
|
||||
parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='配置文件路径')
|
||||
parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片,默认0为自动切片,单位为秒/s')
|
||||
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='wav文件名列表,放在raw文件夹下')
|
||||
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"], help='wav文件名列表,放在raw文件夹下')
|
||||
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
|
||||
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称')
|
||||
|
||||
|
@ -38,6 +37,7 @@ def main():
|
|||
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意:crepe为原F0使用均值滤波器)')
|
||||
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
|
||||
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散,使用后可解决一部分电音问题,默认关闭,该选项打开时,NSF_HIFIGAN增强器将会被禁止')
|
||||
parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=True, help='是否使用角色融合')
|
||||
|
||||
# 浅扩散设置
|
||||
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径')
|
||||
|
@ -79,9 +79,12 @@ def main():
|
|||
k_step = args.k_step
|
||||
only_diffusion = args.only_diffusion
|
||||
shallow_diffusion = args.shallow_diffusion
|
||||
|
||||
use_spk_mix = args.use_spk_mix
|
||||
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
|
||||
infer_tool.mkdir(["raw", "results"])
|
||||
|
||||
if use_spk_mix:
|
||||
spk_list = [spk_mix_map]
|
||||
|
||||
infer_tool.fill_a_to_b(trans, clean_names)
|
||||
for clean_name, tran in zip(clean_names, trans):
|
||||
|
@ -105,7 +108,8 @@ def main():
|
|||
"f0_predictor" : f0p,
|
||||
"enhancer_adaptive_key" : enhancer_adaptive_key,
|
||||
"cr_threshold" : cr_threshold,
|
||||
"k_step":k_step
|
||||
"k_step":k_step,
|
||||
"use_spk_mix":use_spk_mix
|
||||
}
|
||||
audio = svc_model.slice_inference(**kwarg)
|
||||
key = "auto" if auto_predict_f0 else f"{tran}key"
|
||||
|
@ -113,7 +117,9 @@ def main():
|
|||
isdiffusion = "sovits"
|
||||
if shallow_diffusion : isdiffusion = "sovdiff"
|
||||
if only_diffusion : isdiffusion = "diff"
|
||||
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
|
||||
if type(spk) != type('aa'):
|
||||
spk = "spk_mix"
|
||||
res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
|
||||
soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
|
||||
svc_model.clear_empty()
|
||||
|
||||
|
|
28
models.py
28
models.py
|
@ -382,11 +382,11 @@ class SynthesizerTrn(nn.Module):
|
|||
self.emb_uv = nn.Embedding(2, hidden_channels)
|
||||
self.character_mix = False
|
||||
|
||||
def EnableCharacterMix(self, n_speakers_map):
|
||||
self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels))
|
||||
def EnableCharacterMix(self, n_speakers_map, device):
|
||||
self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels)).to(device)
|
||||
for i in range(n_speakers_map):
|
||||
self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
|
||||
self.speaker_map = self.speaker_map.unsqueeze(0)
|
||||
self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]).to(device))
|
||||
self.speaker_map = self.speaker_map.unsqueeze(0).to(device)
|
||||
self.character_mix = True
|
||||
|
||||
def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
|
||||
|
@ -413,9 +413,25 @@ class SynthesizerTrn(nn.Module):
|
|||
|
||||
return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
|
||||
|
||||
def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
|
||||
def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False):
|
||||
|
||||
if c.device == torch.device("cuda"):
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
else:
|
||||
torch.manual_seed(seed)
|
||||
|
||||
c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
|
||||
g = self.emb_g(g).transpose(1, 2)
|
||||
|
||||
if self.character_mix and len(g) > 1: # [N, S] * [S, B, 1, H]
|
||||
g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
|
||||
g = g * self.speaker_map # [N, S, B, 1, H]
|
||||
g = torch.sum(g, dim=1) # [N, B, 1, H]
|
||||
g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
|
||||
else:
|
||||
if g.dim() == 1:
|
||||
g = g.unsqueeze(0)
|
||||
g = self.emb_g(g).transpose(1, 2)
|
||||
|
||||
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
|
||||
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
# 角色混合轨道 编写规则:
|
||||
# 角色ID : [[起始时间1, 终止时间1, 起始数值1, 终止数值1], [起始时间2, 终止时间2, 起始数值2, 终止数值2]]
|
||||
# 起始时间和前一个的终止时间必须相同,第一个起始时间必须为0,最后一个终止时间必须为1 (时间的范围为0-1)
|
||||
# 全部角色必须填写,不使用的角色填[[0., 1., 0., 0.]]即可
|
||||
# 融合数值可以随便填,在指定的时间段内从起始数值线性变化为终止数值,内部会自动确保线性组合为1,可以放心使用
|
||||
|
||||
spk_mix_map = {
|
||||
0 : [[0., 0.5, 1, 0.5], [0.5, 1, 0.5, 1]],
|
||||
1 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]],
|
||||
2 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]]
|
||||
}
|
11
utils.py
11
utils.py
|
@ -9,7 +9,6 @@ import subprocess
|
|||
import warnings
|
||||
import random
|
||||
import functools
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
from scipy.io.wavfile import read
|
||||
|
@ -110,19 +109,19 @@ def get_speech_encoder(speech_encoder,device=None,**kargs):
|
|||
speech_encoder_object = ContentVec256L9(device = device)
|
||||
elif speech_encoder == "vec256l9-onnx":
|
||||
from vencoder.ContentVec256L9_Onnx import ContentVec256L9_Onnx
|
||||
speech_encoder_object = ContentVec256L9(device = device)
|
||||
speech_encoder_object = ContentVec256L9_Onnx(device = device)
|
||||
elif speech_encoder == "vec256l12-onnx":
|
||||
from vencoder.ContentVec256L12_Onnx import ContentVec256L12_Onnx
|
||||
speech_encoder_object = ContentVec256L9(device = device)
|
||||
speech_encoder_object = ContentVec256L12_Onnx(device = device)
|
||||
elif speech_encoder == "vec768l9-onnx":
|
||||
from vencoder.ContentVec768L9_Onnx import ContentVec768L9_Onnx
|
||||
speech_encoder_object = ContentVec256L9(device = device)
|
||||
speech_encoder_object = ContentVec768L9_Onnx(device = device)
|
||||
elif speech_encoder == "vec768l12-onnx":
|
||||
from vencoder.ContentVec768L12_Onnx import ContentVec768L12_Onnx
|
||||
speech_encoder_object = ContentVec256L9(device = device)
|
||||
speech_encoder_object = ContentVec768L12_Onnx(device = device)
|
||||
elif speech_encoder == "hubertsoft-onnx":
|
||||
from vencoder.HubertSoft_Onnx import HubertSoft_Onnx
|
||||
speech_encoder_object = HubertSoft(device = device)
|
||||
speech_encoder_object = HubertSoft_Onnx(device = device)
|
||||
elif speech_encoder == "hubertsoft":
|
||||
from vencoder.HubertSoft import HubertSoft
|
||||
speech_encoder_object = HubertSoft(device = device)
|
||||
|
|
Loading…
Reference in New Issue