Support CharacterMix TimeLine

白叶 藤原 2023-05-29 00:00:02 +08:00
parent 04e4495dce
commit 867f0c5a43
6 changed files with 136 additions and 33 deletions

7
.gitignore vendored
View File

@@ -157,3 +157,10 @@ filelists/val.txt
 .idea/vcs.xml
 .idea/inspectionProfiles/profiles_settings.xml
 .idea/inspectionProfiles/Project_Default.xml
+pretrain/vec-768-layer-12.onnx
+pretrain/hubert-soft.onnx
+pretrain/hubert4.0.onnx
+pretrain/vec-256-layer-9.onnx
+pretrain/vec-256-layer-12.onnx
+pretrain/vec-768-layer-9.onnx
+.vscode/launch.json

View File

@@ -219,16 +219,23 @@ class Svc(object):
                  f0_predictor='pm',
                  enhancer_adaptive_key = 0,
                  cr_threshold = 0.05,
-                 k_step = 100
+                 k_step = 100,
+                 frame = 0,
+                 spk_mix = False
                  ):
-        speaker_id = self.spk2id.get(speaker)
-        if not speaker_id and type(speaker) is int:
-            if len(self.spk2id.__dict__) >= speaker:
-                speaker_id = speaker
-        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
         wav, sr = librosa.load(raw_path, sr=self.target_sample)
-        c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+        if spk_mix:
+            c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            n_frames = f0.size(1)
+            sid = speaker[:, frame:frame+n_frames].transpose(0,1)
+        else:
+            speaker_id = self.spk2id.get(speaker)
+            if not speaker_id and type(speaker) is int:
+                if len(self.spk2id.__dict__) >= speaker:
+                    speaker_id = speaker
+            sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+            c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
+            n_frames = f0.size(1)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
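
In the new spk_mix branch, `speaker` is no longer a speaker name: slice_inference passes in a per-frame weight matrix over all speakers, and `frame` is the offset of the current segment within that timeline, so `sid` becomes the slice of weights for exactly the frames being synthesized. A rough sketch of the shapes involved (the sizes are made up for illustration, not taken from the commit):

```python
import torch

n_speakers, total_frames, frame, n_frames = 2, 1000, 200, 150
speaker = torch.rand(n_speakers, total_frames)             # per-frame mix weight for each speaker
speaker = speaker / speaker.sum(dim=0, keepdim=True)       # each frame's weights sum to 1

sid = speaker[:, frame:frame + n_frames].transpose(0, 1)   # -> [n_frames, n_speakers]
assert sid.shape == (n_frames, n_speakers)
```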
@@ -266,7 +273,7 @@ class Svc(object):
                                                     adaptive_key = enhancer_adaptive_key)
                 use_time = time.time() - start
                 print("vits use time:{}".format(use_time))
-        return audio, audio.shape[-1]
+        return audio, audio.shape[-1], n_frames

     def clear_empty(self):
         # clean up vram
@@ -297,7 +304,8 @@ class Svc(object):
                         f0_predictor='pm',
                         enhancer_adaptive_key = 0,
                         cr_threshold = 0.05,
-                        k_step = 100
+                        k_step = 100,
+                        use_spk_mix = False
                         ):
         wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
@@ -308,7 +316,59 @@ class Svc(object):
         lg_size_c_l = (lg_size-lg_size_r)//2
         lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
         lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
+        if use_spk_mix:
+            assert len(self.spk2id) == len(spk)
+            self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
+            audio_length = 0
+            for (slice_tag, data) in audio_data:
+                aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
+                if slice_tag:
+                    audio_length += aud_length // self.hop_size
+                    continue
+                if per_size != 0:
+                    datas = split_list_by_n(data, per_size, lg_size)
+                else:
+                    datas = [data]
+                for k, dat in enumerate(datas):
+                    pad_len = int(audio_sr * pad_seconds)
+                    per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
+                    a_length = per_length + 2 * pad_len
+                    audio_length += a_length // self.hop_size
+            audio_length += len(audio_data)
+            spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
+            for i in range(len(spk)):
+                last_end = None
+                for mix in spk[i]:
+                    if mix[3] < 0. or mix[2] < 0.:
+                        raise RuntimeError("mix value must be higher than zero!")
+                    begin = int(audio_length * mix[0])
+                    end = int(audio_length * mix[1])
+                    length = end - begin
+                    if length <= 0:
+                        raise RuntimeError("begin must be lower than end!")
+                    step = (mix[3] - mix[2]) / length
+                    if last_end is not None:
+                        if last_end != begin:
+                            raise RuntimeError("[i] EndTime must equal [i+1] BeginTime!")
+                    last_end = end
+                    spk_mix_data = torch.arange(mix[2], mix[3], step).to(self.dev)
+                    if len(spk_mix_data) < length:
+                        num_pad = length - len(spk_mix_data)
+                        spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
+                    spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
+            spk_mix_ten = torch.sum(spk_mix_tensor, dim=0).unsqueeze(0).to(self.dev)
+            # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
+            for i, x in enumerate(spk_mix_ten[0]):
+                if x == 0.0:
+                    spk_mix_ten[0][i] = 1.0
+                    spk_mix_tensor[:, i] = 1.0 / len(spk)
+            spk_mix_tensor = spk_mix_tensor / spk_mix_ten
+            if not ((torch.sum(spk_mix_tensor, dim=0) - 1.) < 0.0001).all():
+                raise RuntimeError("sum(spk_mix_tensor) does not equal 1")
+            spk = spk_mix_tensor
+        global_frame = 0
         audio = []
         for (slice_tag, data) in audio_data:
             print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
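
The `use_spk_mix` block above first counts how many frames the whole track will occupy (including padding and the skipped silent slices), then turns each speaker's timeline segments into linear ramps over those frames and normalizes every frame so the weights sum to 1. A simplified standalone sketch of the same construction (arbitrary frame count and timeline, reflect-padding edge case omitted):

```python
import torch

# [begin, end, start_value, end_value] per segment, times in 0..1 as in spkmix.py
spk_timeline = {
    0: [[0.0, 0.5, 1.0, 0.5], [0.5, 1.0, 0.5, 1.0]],
    1: [[0.0, 1.0, 0.0, 0.5]],
}
total_frames = 100

mix = torch.zeros(len(spk_timeline), total_frames)
for spk_id, segments in spk_timeline.items():
    for begin, end, v0, v1 in segments:
        b, e = int(total_frames * begin), int(total_frames * end)
        mix[spk_id, b:e] = torch.linspace(v0, v1, e - b)   # linear ramp over the segment

mix = mix / mix.sum(dim=0, keepdim=True)                   # normalize each frame to sum to 1
assert torch.allclose(mix.sum(dim=0), torch.ones(total_frames))
```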
@@ -318,6 +378,7 @@ class Svc(object):
                 print('jump empty segment')
                 _audio = np.zeros(length)
                 audio.extend(list(pad_array(_audio, length)))
+                global_frame += length // self.hop_size
                 continue
             if per_size != 0:
                 datas = split_list_by_n(data, per_size,lg_size)
@@ -332,15 +393,18 @@ class Svc(object):
                 raw_path = io.BytesIO()
                 soundfile.write(raw_path, dat, audio_sr, format="wav")
                 raw_path.seek(0)
-                out_audio, out_sr = self.infer(spk, tran, raw_path,
+                out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
                                                 cluster_infer_ratio=cluster_infer_ratio,
                                                 auto_predict_f0=auto_predict_f0,
                                                 noice_scale=noice_scale,
                                                 f0_predictor = f0_predictor,
                                                 enhancer_adaptive_key = enhancer_adaptive_key,
                                                 cr_threshold = cr_threshold,
-                                                k_step = k_step
+                                                k_step = k_step,
+                                                frame = global_frame,
+                                                spk_mix = use_spk_mix
                                                 )
+                global_frame += out_frame
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)
                 _audio = _audio[pad_len:-pad_len]

View File

@@ -2,12 +2,11 @@ import io
 import logging
 import time
 from pathlib import Path
+from spkmix import spk_mix_map
 import librosa
 import matplotlib.pyplot as plt
 import numpy as np
 import soundfile
 from inference import infer_tool
 from inference import slicer
 from inference.infer_tool import Svc
@ -23,10 +22,10 @@ def main():
parser = argparse.ArgumentParser(description='sovits4 inference') parser = argparse.ArgumentParser(description='sovits4 inference')
# 一定要设置的部分 # 一定要设置的部分
parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径') parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='模型路径')
parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径') parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='配置文件路径')
parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片默认0为自动切片单位为秒/s') parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片默认0为自动切片单位为秒/s')
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='wav文件名列表放在raw文件夹下') parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"], help='wav文件名列表放在raw文件夹下')
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)') parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称') parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称')
@ -38,6 +37,7 @@ def main():
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意crepe为原F0使用均值滤波器)') parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意crepe为原F0使用均值滤波器)')
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭') parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散使用后可解决一部分电音问题默认关闭该选项打开时NSF_HIFIGAN增强器将会被禁止') parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散使用后可解决一部分电音问题默认关闭该选项打开时NSF_HIFIGAN增强器将会被禁止')
parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=True, help='是否使用角色融合')
# 浅扩散设置 # 浅扩散设置
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径') parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径')
@ -79,9 +79,12 @@ def main():
k_step = args.k_step k_step = args.k_step
only_diffusion = args.only_diffusion only_diffusion = args.only_diffusion
shallow_diffusion = args.shallow_diffusion shallow_diffusion = args.shallow_diffusion
use_spk_mix = args.use_spk_mix
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion) svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
infer_tool.mkdir(["raw", "results"]) infer_tool.mkdir(["raw", "results"])
if use_spk_mix:
spk_list = [spk_mix_map]
infer_tool.fill_a_to_b(trans, clean_names) infer_tool.fill_a_to_b(trans, clean_names)
for clean_name, tran in zip(clean_names, trans): for clean_name, tran in zip(clean_names, trans):
@ -105,7 +108,8 @@ def main():
"f0_predictor" : f0p, "f0_predictor" : f0p,
"enhancer_adaptive_key" : enhancer_adaptive_key, "enhancer_adaptive_key" : enhancer_adaptive_key,
"cr_threshold" : cr_threshold, "cr_threshold" : cr_threshold,
"k_step":k_step "k_step":k_step,
"use_spk_mix":use_spk_mix
} }
audio = svc_model.slice_inference(**kwarg) audio = svc_model.slice_inference(**kwarg)
key = "auto" if auto_predict_f0 else f"{tran}key" key = "auto" if auto_predict_f0 else f"{tran}key"
@ -113,7 +117,9 @@ def main():
isdiffusion = "sovits" isdiffusion = "sovits"
if shallow_diffusion : isdiffusion = "sovdiff" if shallow_diffusion : isdiffusion = "sovdiff"
if only_diffusion : isdiffusion = "diff" if only_diffusion : isdiffusion = "diff"
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}' if type(spk) != type('aa'):
spk = "spk_mix"
res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format) soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
svc_model.clear_empty() svc_model.clear_empty()
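
End to end, the new flag simply swaps the speaker list for `[spk_mix_map]` before calling `slice_inference`. Below is a hypothetical minimal driver for the same path; the keyword names `spk`, `tran` and `slice_db`, the two-argument `Svc(...)` call, and the file paths are assumptions not shown in these hunks:

```python
from inference.infer_tool import Svc
from spkmix import spk_mix_map

svc_model = Svc("logs/44k/G_37600.pth", "logs/44k/config.json")
audio = svc_model.slice_inference(
    raw_audio_path="raw/test.wav",   # assumed keyword name; only Path(raw_audio_path) appears above
    spk=spk_mix_map,                 # the timeline dict takes the place of a speaker name
    tran=0,                          # pitch shift in semitones (assumed keyword name)
    slice_db=-40,                    # slicing threshold (assumed keyword name)
    cluster_infer_ratio=0,
    auto_predict_f0=False,
    noice_scale=0.4,
    use_spk_mix=True,
)
```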

View File

@@ -382,11 +382,11 @@ class SynthesizerTrn(nn.Module):
         self.emb_uv = nn.Embedding(2, hidden_channels)
         self.character_mix = False

-    def EnableCharacterMix(self, n_speakers_map):
-        self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels))
+    def EnableCharacterMix(self, n_speakers_map, device):
+        self.speaker_map = torch.zeros((n_speakers_map, 1, 1, self.gin_channels)).to(device)
         for i in range(n_speakers_map):
-            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
-        self.speaker_map = self.speaker_map.unsqueeze(0)
+            self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]).to(device))
+        self.speaker_map = self.speaker_map.unsqueeze(0).to(device)
         self.character_mix = True

     def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
@@ -413,9 +413,25 @@ class SynthesizerTrn(nn.Module):
         return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0

-    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
+    def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False):
+        if c.device == torch.device("cuda"):
+            torch.cuda.manual_seed_all(seed)
+        else:
+            torch.manual_seed(seed)
         c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
-        g = self.emb_g(g).transpose(1, 2)
+        if self.character_mix and len(g) > 1:  # [N, S] * [S, B, 1, H]
+            g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1))  # [N, S, B, 1, 1]
+            g = g * self.speaker_map  # [N, S, B, 1, H]
+            g = torch.sum(g, dim=1)  # [N, 1, B, 1, H]
+            g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # [B, H, N]
+        else:
+            if g.dim() == 1:
+                g = g.unsqueeze(0)
+            g = self.emb_g(g).transpose(1, 2)
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
         x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
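
When `character_mix` is enabled, `g` arrives as per-frame speaker weights rather than a speaker id, and the branch above blends the precomputed `speaker_map` embeddings with a weighted sum. A minimal shape walk-through with dummy sizes and random tensors (not taken from the commit):

```python
import torch

N, S, H = 150, 3, 256                       # frames, speakers, gin_channels
weights = torch.rand(N, S)                  # per-frame speaker weights ("g" before mixing)
weights = weights / weights.sum(dim=1, keepdim=True)
speaker_map = torch.randn(1, S, 1, 1, H)    # one embedding per speaker, as EnableCharacterMix stores it

g = weights.reshape(N, S, 1, 1, 1)          # [N, S, 1, 1, 1]
g = g * speaker_map                         # [N, S, 1, 1, H]
g = torch.sum(g, dim=1)                     # [N, 1, 1, H] - weighted sum of speaker embeddings
g = g.transpose(0, -1).transpose(0, -2).squeeze(0)  # -> [1, H, N], one conditioning vector per frame
assert g.shape == (1, H, N)
```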

11
spkmix.py Normal file
View File

@@ -0,0 +1,11 @@
+# Character mix timeline. Rules:
+# speaker_id : [[start_time_1, end_time_1, start_value_1, end_value_1], [start_time_2, end_time_2, start_value_2, end_value_2]]
+# Each segment's start time must equal the previous segment's end time; the first start time must be 0 and the last end time must be 1 (times range from 0 to 1).
+# Every speaker must be listed; for unused speakers just write [[0., 1., 0., 0.]].
+# The mix values can be anything: within a segment the weight changes linearly from the start value to the end value, and the code normalizes each frame so the weights form a linear combination summing to 1.
+spk_mix_map = {
+    0 : [[0., 0.5, 1, 0.5], [0.5, 1, 0.5, 1]],
+    1 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]],
+    2 : [[0., 0.35, 1, 0.5], [0.35, 0.75, 0.75, 1], [0.75, 1, 0.45, 1]]
+}
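
The rules in these comments can be checked mechanically before inference. A small helper sketch, not part of the commit, that validates a `spk_mix_map` against them:

```python
from spkmix import spk_mix_map

def validate_spk_mix_map(spk_mix_map):
    for spk_id, segments in spk_mix_map.items():
        if segments[0][0] != 0. or segments[-1][1] != 1.:
            raise ValueError(f"speaker {spk_id}: timeline must start at 0 and end at 1")
        last_end = None
        for begin, end, start_val, end_val in segments:
            if begin >= end:
                raise ValueError(f"speaker {spk_id}: begin must be lower than end")
            if start_val < 0. or end_val < 0.:
                raise ValueError(f"speaker {spk_id}: mix values must not be negative")
            if last_end is not None and last_end != begin:
                raise ValueError(f"speaker {spk_id}: segment begin must equal previous end")
            last_end = end

validate_spk_mix_map(spk_mix_map)  # raises on malformed timelines
```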

View File

@@ -9,7 +9,6 @@ import subprocess
 import warnings
 import random
 import functools
 import librosa
 import numpy as np
 from scipy.io.wavfile import read
@@ -110,19 +109,19 @@ def get_speech_encoder(speech_encoder,device=None,**kargs):
         speech_encoder_object = ContentVec256L9(device = device)
     elif speech_encoder == "vec256l9-onnx":
         from vencoder.ContentVec256L9_Onnx import ContentVec256L9_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec256L9_Onnx(device = device)
     elif speech_encoder == "vec256l12-onnx":
         from vencoder.ContentVec256L12_Onnx import ContentVec256L12_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec256L12_Onnx(device = device)
     elif speech_encoder == "vec768l9-onnx":
         from vencoder.ContentVec768L9_Onnx import ContentVec768L9_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec768L9_Onnx(device = device)
     elif speech_encoder == "vec768l12-onnx":
         from vencoder.ContentVec768L12_Onnx import ContentVec768L12_Onnx
-        speech_encoder_object = ContentVec256L9(device = device)
+        speech_encoder_object = ContentVec768L12_Onnx(device = device)
     elif speech_encoder == "hubertsoft-onnx":
         from vencoder.HubertSoft_Onnx import HubertSoft_Onnx
-        speech_encoder_object = HubertSoft(device = device)
+        speech_encoder_object = HubertSoft_Onnx(device = device)
     elif speech_encoder == "hubertsoft":
         from vencoder.HubertSoft import HubertSoft
         speech_encoder_object = HubertSoft(device = device)