Diff Update

This commit is contained in:
ylzz1997 2023-05-17 19:20:45 +08:00
parent 8cc7645379
commit 7dbb0ba66a
9 changed files with 128 additions and 83 deletions

View File

@@ -145,10 +145,10 @@ class AudioDataset(Dataset):
if n_spk is not None and n_spk > 1:
spk_name = name_ext.split("/")[-2]
spk_id = spk[spk_name] if spk_name in spk else 0
if spk_id < 1 or spk_id > n_spk:
raise ValueError(' [x] Multi-speaker training error: spk_id must be a positive integer from 1 to n_spk ')
if spk_id < 0 or spk_id >= n_spk:
raise ValueError(' [x] Multi-speaker training error: spk_id must be an integer from 0 to n_spk-1 ')
else:
spk_id = 1
spk_id = 0
spk_id = torch.LongTensor(np.array([spk_id])).to(device)
if load_all_data:
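Note: with this commit, speaker IDs are zero-based throughout the pipeline. A minimal standalone sketch of the new check, using a hypothetical name-to-id map like the one read from the config's spk section:

spk = {"speaker_a": 0, "speaker_b": 1}  # hypothetical mapping from the config
n_spk = len(spk)
spk_name = "speaker_b"
spk_id = spk[spk_name] if spk_name in spk else 0  # unknown names fall back to 0
if spk_id < 0 or spk_id >= n_spk:
    raise ValueError(' [x] Multi-speaker training error: spk_id must be an integer from 0 to n_spk-1 ')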

View File

@@ -22,7 +22,8 @@ def test(args, model, vocoder, loader_test, saver):
# run
with torch.no_grad():
for bidx, data in enumerate(loader_test):
fn = data['name'][0]
fn = data['name'][0].split("/")[-1]
speaker = data['name'][0].split("/")[-2]
print('--------')
print('{}/{} - {}'.format(bidx, num_batches, fn))
@@ -65,16 +66,15 @@ def test(args, model, vocoder, loader_test, saver):
test_loss += loss.item()
# log mel
saver.log_spec(data['name'][0], data['mel'], mel)
saver.log_spec(f"{speaker}_{fn}.wav", data['mel'], mel)
# log audio
path_audio = os.path.join(args.data.valid_path, 'audio', data['name_ext'][0])
# log audio
path_audio = data['name_ext'][0]
audio, sr = librosa.load(path_audio, sr=args.data.sampling_rate)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio)
audio = torch.from_numpy(audio).unsqueeze(0).to(signal)
saver.log_audio({fn+'/gt.wav': audio, fn+'/pred.wav': signal})
saver.log_audio({f"{speaker}_{fn}_gt.wav": audio,f"{speaker}_{fn}_pred.wav": signal})
# report
test_loss /= args.train.batch_size
test_loss /= num_batches
@@ -107,6 +107,7 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
dtype = torch.bfloat16
else:
raise ValueError(' [x] Unknown amp_dtype: ' + args.train.amp_dtype)
saver.log_info("epoch|batch_idx/num_batches|output_dir|batch/s|lr|time|step")
for epoch in range(args.train.epochs):
for batch_idx, data in enumerate(loader_train):
saver.global_step_increment()

View File

@@ -18,8 +18,12 @@ class DotDict(dict):
def load_model_vocoder(
model_path,
device='cpu'):
config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
device='cpu',
config_path = None
):
if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
else: config_file = config_path
with open(config_file, "r") as config:
args = yaml.safe_load(config)
args = DotDict(args)
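A usage sketch of the extended loader with an explicit config path (all paths are illustrative; as the Svc changes further below show, it returns the diffusion model, the vocoder and the parsed config):

from diffusion.unit2mel import load_model_vocoder

# When config_path is omitted, the config.yaml next to the checkpoint is used instead.
model, vocoder, diff_args = load_model_vocoder(
    "logs/44k/diffusion/model_0.pt",
    device="cuda",
    config_path="logs/44k/diffusion/config.yaml",
)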
@@ -85,9 +89,9 @@ class Unit2Mel(nn.Module):
if spk_mix_dict is not None:
for k, v in spk_mix_dict.items():
spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
x = x + v * self.spk_embed(spk_id_torch - 1)
x = x + v * self.spk_embed(spk_id_torch)
else:
x = x + self.spk_embed(spk_id - 1)
x = x + self.spk_embed(spk_id)
if self.aug_shift_embed is not None and aug_shift is not None:
x = x + self.aug_shift_embed(aug_shift / 5)
x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
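Note: dropping the "- 1" offset keeps the embedding lookup consistent with the now zero-based speaker IDs. Assuming spk_embed is an nn.Embedding(n_spk, hidden) table (not shown in this hunk), valid indices run from 0 to n_spk-1:

import torch
import torch.nn as nn

n_spk, hidden = 2, 256
spk_embed = nn.Embedding(n_spk, hidden)  # assumed layout of the speaker table
spk_id = torch.LongTensor([[0]])         # zero-based id straight from the dataset
x = torch.zeros(1, 1, hidden)
x = x + spk_embed(spk_id)                # no "- 1" offset needed any more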

View File

@@ -19,6 +19,9 @@ import cluster
import utils
from models import SynthesizerTrn
from diffusion.unit2mel import load_model_vocoder
import yaml
logging.getLogger('matplotlib').setLevel(logging.WARNING)
@@ -114,7 +117,11 @@ class Svc(object):
def __init__(self, net_g_path, config_path,
device=None,
cluster_model_path="logs/44k/kmeans_10000.pt",
nsf_hifigan_enhance = False
nsf_hifigan_enhance = False,
diffusion_model_path="logs/44k/diffusion/model_0.pt",
diffusion_config_path="configs/diffusion.yaml",
shallow_diffusion = False,
only_diffusion = False,
):
self.net_g_path = net_g_path
if device is None:
@@ -127,19 +134,32 @@ class Svc(object):
self.hop_size = self.hps_ms.data.hop_length
self.spk2id = self.hps_ms.spk
self.nsf_hifigan_enhance = nsf_hifigan_enhance
self.only_diffusion = only_diffusion
self.shallow_diffusion = shallow_diffusion
try:
self.speech_encoder = self.hps_ms.model.speech_encoder
except Exception as e:
self.speech_encoder = 'vec768l12'
# load hubert
self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
self.load_model()
if self.shallow_diffusion or self.only_diffusion:
self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
# load hubert and model
if not self.only_diffusion:
self.load_model()
self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
self.volume_extractor = utils.Volume_Extractor(self.hps_ms.data.hop_length)
assert self.diffusion_args.data.encoder == self.hps_ms.model.speech_encoder
else:
self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
if os.path.exists(cluster_model_path):
self.cluster_model = cluster.get_cluster_model(cluster_model_path)
if self.shallow_diffusion : self.nsf_hifigan_enhance = False
if self.nsf_hifigan_enhance:
from modules.enhancer import Enhancer
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
def load_model(self):
# get model configuration
self.net_g_ms = SynthesizerTrn(
@@ -157,7 +177,7 @@ class Svc(object):
def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
wav, sr = librosa.load(in_path, sr=self.target_sample)
f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
f0, uv = f0_predictor_object.compute_f0_uv(wav)
@@ -190,7 +210,8 @@ class Svc(object):
f0_filter=False,
f0_predictor='pm',
enhancer_adaptive_key = 0,
cr_threshold = 0.05
cr_threshold = 0.05,
k_step = 100
):
speaker_id = self.spk2id.__dict__.get(speaker)
@@ -203,7 +224,44 @@ class Svc(object):
c = c.half()
with torch.no_grad():
start = time.time()
audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
if not self.only_diffusion:
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
audio = audio[0,0].data.float()
if self.shallow_diffusion:
audio_mel = self.vocoder.extract(audio[None,:])
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
f0 = f0[:,:,None]
c = c.transpose(-1,-2)
audio_mel = self.diffusion_model(
c,
f0,
vol,
spk_id = sid,
spk_mix_dict = None,
gt_spec=audio_mel,
infer=True,
infer_speedup=self.diffusion_args.infer.speedup,
method=self.diffusion_args.infer.methold,
k_step=k_step)
audio = self.vocoder.infer(audio_mel, f0).squeeze()
else:
wav, sr = librosa.load(raw_path, sr=self.target_sample)
wav = torch.FloatTensor(wav).to(self.dev)
vol = self.volume_extractor.extract(wav[None,:])[None,:,None]
c = c.transpose(-1,-2)
f0 = f0[:,:,None]
audio_mel = self.diffusion_model(
c,
f0,
vol,
spk_id = sid,
spk_mix_dict = None,
gt_spec=None,
infer=True,
infer_speedup=self.diffusion_args.infer.speedup,
method=self.diffusion_args.infer.methold,
k_step=k_step)
audio = self.vocoder.infer(audio_mel, f0).squeeze()
if self.nsf_hifigan_enhance:
audio, _ = self.enhancer.enhance(
audio[None,:],
@@ -243,9 +301,10 @@ class Svc(object):
lgr_num =0.75,
f0_predictor='pm',
enhancer_adaptive_key = 0,
cr_threshold = 0.05
cr_threshold = 0.05,
k_step = 100
):
wav_path = raw_audio_path
wav_path = Path(raw_audio_path).with_suffix('.wav')
chunks = slicer.cut(wav_path, db_thresh=slice_db)
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
per_size = int(clip_seconds*audio_sr)
@@ -284,7 +343,8 @@ class Svc(object):
noice_scale=noice_scale,
f0_predictor = f0_predictor,
enhancer_adaptive_key = enhancer_adaptive_key,
cr_threshold = cr_threshold
cr_threshold = cr_threshold,
k_step = k_step
)
_audio = out_audio.cpu().numpy()
pad_len = int(self.target_sample * pad_seconds)
@@ -327,7 +387,7 @@ class RealTimeVC:
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
f0_filter=f0_filter)
audio = audio.cpu().numpy()
self.last_chunk = audio[-self.pre_len:]
self.last_o = audio
@@ -348,3 +408,4 @@ class RealTimeVC:
self.last_chunk = audio[-self.pre_len:]
self.last_o = audio
return ret[self.chunk_len:2 * self.chunk_len]
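A hedged usage sketch of the extended Svc constructor, mirroring the positional call in inference_main.py below; the import path and all file paths are assumptions or defaults, not part of this diff:

from inference.infer_tool import Svc  # assumed module path

svc_model = Svc(
    "logs/44k/G_10000.pth",            # net_g_path (placeholder)
    "configs/config.json",             # config_path (placeholder)
    None,                              # device: auto-detect
    "logs/44k/kmeans_10000.pt",        # cluster_model_path
    False,                             # nsf_hifigan_enhance
    "logs/44k/diffusion/model_0.pt",   # diffusion_model_path
    "logs/44k/diffusion/config.yaml",  # diffusion_config_path
    True,                              # shallow_diffusion
    False,                             # only_diffusion
)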

View File

@@ -29,7 +29,7 @@ def main():
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='List of wav file names, placed under the raw folder')
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='Pitch shift, positive or negative values supported (in semitones)')
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='Target speaker name(s) for synthesis')
# Optional settings
parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='Automatically predict pitch during voice conversion; do not enable this when converting singing voice, or it will go badly out of tune')
parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='Path to the clustering model; fill in anything if no clustering model was trained')
@@ -37,6 +37,13 @@ def main():
parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='Cross-fade length between two audio slices; adjust this value if forced slicing makes the vocals discontinuous, otherwise keep the default of 0, in seconds')
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor to use: crepe, pm, dio or harvest, default pm (note: crepe applies a mean filter to the raw F0)')
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='Whether to use the NSF_HIFIGAN enhancer; it can improve audio quality for models trained on small datasets but hurts well-trained models, off by default')
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='Whether to use shallow diffusion, which can fix some metallic artifacts; off by default. When enabled, the NSF_HIFIGAN enhancer is disabled')
# Shallow diffusion settings
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='Path to the diffusion model')
parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='Path to the diffusion model config file')
parser.add_argument('-ks', '--k_step', type=int, default=100, help='Number of diffusion steps; larger values get closer to the pure diffusion model result, default 100')
parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='Diffusion-only mode; the sovits model is not loaded and inference uses only the diffusion model')
# Settings that usually do not need changing
parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='Default -40; use -30 for noisy audio, -50 for clean vocals that keep breath sounds')
@@ -67,70 +74,40 @@ def main():
enhance = args.enhance
enhancer_adaptive_key = args.enhancer_adaptive_key
cr_threshold = args.f0_filter_threshold
diffusion_model_path = args.diffusion_model_path
diffusion_config_path = args.diffusion_config_path
k_step = args.k_step
only_diffusion = args.only_diffusion
shallow_diffusion = args.shallow_diffusion
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
infer_tool.mkdir(["raw", "results"])
infer_tool.fill_a_to_b(trans, clean_names)
for clean_name, tran in zip(clean_names, trans):
raw_audio_path = f"raw/{clean_name}"
if "." not in raw_audio_path:
raw_audio_path += ".wav"
infer_tool.format_wav(raw_audio_path)
wav_path = Path(raw_audio_path).with_suffix('.wav')
chunks = slicer.cut(wav_path, db_thresh=slice_db)
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
per_size = int(clip*audio_sr)
lg_size = int(lg*audio_sr)
lg_size_r = int(lg_size*lgr)
lg_size_c_l = (lg_size-lg_size_r)//2
lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
lg_2 = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
for spk in spk_list:
audio = []
for (slice_tag, data) in audio_data:
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
if slice_tag:
print('jump empty segment')
_audio = np.zeros(length)
audio.extend(list(infer_tool.pad_array(_audio, length)))
continue
if per_size != 0:
datas = infer_tool.split_list_by_n(data, per_size,lg_size)
else:
datas = [data]
for k,dat in enumerate(datas):
per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
# pad
pad_len = int(audio_sr * pad_seconds)
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
raw_path = io.BytesIO()
soundfile.write(raw_path, dat, audio_sr, format="wav")
raw_path.seek(0)
out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
f0_predictor = f0p,
enhancer_adaptive_key = enhancer_adaptive_key,
cr_threshold = cr_threshold
)
_audio = out_audio.cpu().numpy()
pad_len = int(svc_model.target_sample * pad_seconds)
_audio = _audio[pad_len:-pad_len]
_audio = infer_tool.pad_array(_audio, per_length)
if lg_size!=0 and k!=0:
lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
lg_pre = lg1*(1-lg_2)+lg2*lg_2
audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
audio.extend(lg_pre)
_audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
audio.extend(list(_audio))
kwarg = {
"raw_audio_path" : raw_audio_path,
"spk" : spk,
"tran" : tran,
"slice_db" : slice_db,
"cluster_infer_ratio" : cluster_infer_ratio,
"auto_predict_f0" : auto_predict_f0,
"noice_scale" : noice_scale,
"pad_seconds" : pad_seconds,
"clip_seconds" : clip,
"lg_num": lg,
"lgr_num" : lgr,
"f0_predictor" : f0p,
"enhancer_adaptive_key" : enhancer_adaptive_key,
"cr_threshold" : cr_threshold,
"k_step":k_step
}
audio = svc_model.slice_inference(**kwarg)
key = "auto" if auto_predict_f0 else f"{tran}key"
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
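A hypothetical command line exercising the new shallow-diffusion options (the -m/-c model and config flags are assumed from the unchanged part of the parser; file names are placeholders):

python inference_main.py -m logs/44k/G_10000.pth -c configs/config.json \
    -n song.wav -t 0 -s nen \
    -shd -dm logs/44k/diffusion/model_0.pt -dc logs/44k/diffusion/config.yaml -ks 100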

View File

@@ -417,4 +417,4 @@ class SynthesizerTrn(nn.Module):
z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
z = self.flow(z_p, c_mask, g=g, reverse=True)
o = self.dec(z * c_mask, g=g, f0=f0)
return o
return o,f0

View File

@@ -19,3 +19,4 @@ librosa==0.9.1
tensorboard
tensorboardX
edge_tts
pyyaml

View File

@@ -22,3 +22,4 @@ onnxsim
onnxoptimizer
tensorboardX
edge_tts
pyyaml

View File

@@ -416,7 +416,7 @@ class Volume_Extractor:
self.hop_size = hop_size
def extract(self, audio): # audio: 2d tensor array
if isinstance(audio,torch.Tensor):
if not isinstance(audio,torch.Tensor):
audio = torch.Tensor(audio)
n_frames = int(audio.size(-1) // self.hop_size)
audio2 = audio ** 2
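Note: the corrected check converts non-tensor input (e.g. a numpy array) to a tensor, instead of only re-wrapping inputs that were tensors already. A standalone sketch of the intended behaviour with hypothetical sizes:

import numpy as np
import torch

hop_size = 512
audio = np.zeros((1, 44100), dtype=np.float32)  # 2d array, roughly 1 s at 44.1 kHz
if not isinstance(audio, torch.Tensor):
    audio = torch.Tensor(audio)                 # numpy input is now converted
n_frames = int(audio.size(-1) // hop_size)      # 86 frames for this input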