Diff Update
This commit is contained in:
parent
8cc7645379
commit
7dbb0ba66a
|
@ -145,10 +145,10 @@ class AudioDataset(Dataset):
|
|||
if n_spk is not None and n_spk > 1:
|
||||
spk_name = name_ext.split("/")[-2]
|
||||
spk_id = spk[spk_name] if spk_name in spk else 0
|
||||
if spk_id < 1 or spk_id > n_spk:
|
||||
raise ValueError(' [x] Muiti-speaker traing error : spk_id must be a positive integer from 1 to n_spk ')
|
||||
if spk_id < 0 or spk_id >= n_spk:
|
||||
raise ValueError(' [x] Muiti-speaker traing error : spk_id must be a positive integer from 0 to n_spk-1 ')
|
||||
else:
|
||||
spk_id = 1
|
||||
spk_id = 0
|
||||
spk_id = torch.LongTensor(np.array([spk_id])).to(device)
|
||||
|
||||
if load_all_data:
|
||||
|
|
|
@ -22,7 +22,8 @@ def test(args, model, vocoder, loader_test, saver):
|
|||
# run
|
||||
with torch.no_grad():
|
||||
for bidx, data in enumerate(loader_test):
|
||||
fn = data['name'][0]
|
||||
fn = data['name'][0].split("/")[-1]
|
||||
speaker = data['name'][0].split("/")[-2]
|
||||
print('--------')
|
||||
print('{}/{} - {}'.format(bidx, num_batches, fn))
|
||||
|
||||
|
@ -65,16 +66,15 @@ def test(args, model, vocoder, loader_test, saver):
|
|||
test_loss += loss.item()
|
||||
|
||||
# log mel
|
||||
saver.log_spec(data['name'][0], data['mel'], mel)
|
||||
saver.log_spec(f"{speaker}_{fn}.wav", data['mel'], mel)
|
||||
|
||||
# log audio
|
||||
path_audio = os.path.join(args.data.valid_path, 'audio', data['name_ext'][0])
|
||||
# log audi
|
||||
path_audio = data['name_ext'][0]
|
||||
audio, sr = librosa.load(path_audio, sr=args.data.sampling_rate)
|
||||
if len(audio.shape) > 1:
|
||||
audio = librosa.to_mono(audio)
|
||||
audio = torch.from_numpy(audio).unsqueeze(0).to(signal)
|
||||
saver.log_audio({fn+'/gt.wav': audio, fn+'/pred.wav': signal})
|
||||
|
||||
saver.log_audio({f"{speaker}_{fn}_gt.wav": audio,f"{speaker}_{fn}_pred.wav": signal})
|
||||
# report
|
||||
test_loss /= args.train.batch_size
|
||||
test_loss /= num_batches
|
||||
|
@ -107,6 +107,7 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
|
|||
dtype = torch.bfloat16
|
||||
else:
|
||||
raise ValueError(' [x] Unknown amp_dtype: ' + args.train.amp_dtype)
|
||||
saver.log_info("epoch|batch_idx/num_batches|output_dir|batch/s|lr|time|step")
|
||||
for epoch in range(args.train.epochs):
|
||||
for batch_idx, data in enumerate(loader_train):
|
||||
saver.global_step_increment()
|
||||
|
|
|
@ -18,8 +18,12 @@ class DotDict(dict):
|
|||
|
||||
def load_model_vocoder(
|
||||
model_path,
|
||||
device='cpu'):
|
||||
config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
|
||||
device='cpu',
|
||||
config_path = None
|
||||
):
|
||||
if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
|
||||
else: config_file = config_path
|
||||
|
||||
with open(config_file, "r") as config:
|
||||
args = yaml.safe_load(config)
|
||||
args = DotDict(args)
|
||||
|
@ -85,9 +89,9 @@ class Unit2Mel(nn.Module):
|
|||
if spk_mix_dict is not None:
|
||||
for k, v in spk_mix_dict.items():
|
||||
spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
|
||||
x = x + v * self.spk_embed(spk_id_torch - 1)
|
||||
x = x + v * self.spk_embed(spk_id_torch)
|
||||
else:
|
||||
x = x + self.spk_embed(spk_id - 1)
|
||||
x = x + self.spk_embed(spk_id)
|
||||
if self.aug_shift_embed is not None and aug_shift is not None:
|
||||
x = x + self.aug_shift_embed(aug_shift / 5)
|
||||
x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
|
||||
|
|
|
@ -19,6 +19,9 @@ import cluster
|
|||
import utils
|
||||
from models import SynthesizerTrn
|
||||
|
||||
from diffusion.unit2mel import load_model_vocoder
|
||||
import yaml
|
||||
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
|
||||
|
||||
|
@ -114,7 +117,11 @@ class Svc(object):
|
|||
def __init__(self, net_g_path, config_path,
|
||||
device=None,
|
||||
cluster_model_path="logs/44k/kmeans_10000.pt",
|
||||
nsf_hifigan_enhance = False
|
||||
nsf_hifigan_enhance = False,
|
||||
diffusion_model_path="logs/44k/diffusion/model_0.pt",
|
||||
diffusion_config_path="configs/diffusion.yaml",
|
||||
shallow_diffusion = False,
|
||||
only_diffusion = False,
|
||||
):
|
||||
self.net_g_path = net_g_path
|
||||
if device is None:
|
||||
|
@ -127,19 +134,32 @@ class Svc(object):
|
|||
self.hop_size = self.hps_ms.data.hop_length
|
||||
self.spk2id = self.hps_ms.spk
|
||||
self.nsf_hifigan_enhance = nsf_hifigan_enhance
|
||||
self.only_diffusion = only_diffusion
|
||||
self.shallow_diffusion = shallow_diffusion
|
||||
try:
|
||||
self.speech_encoder = self.hps_ms.model.speech_encoder
|
||||
except Exception as e:
|
||||
self.speech_encoder = 'vec768l12'
|
||||
# load hubert
|
||||
self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
|
||||
self.load_model()
|
||||
|
||||
if self.shallow_diffusion or self.only_diffusion:
|
||||
self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
|
||||
# load hubert and model
|
||||
if not self.only_diffusion:
|
||||
self.load_model()
|
||||
self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
|
||||
self.volume_extractor = utils.Volume_Extractor(self.hps_ms.data.hop_length)
|
||||
assert self.diffusion_args.data.encoder == self.hps_ms.model.speech_encoder
|
||||
else:
|
||||
self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
|
||||
self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
|
||||
|
||||
if os.path.exists(cluster_model_path):
|
||||
self.cluster_model = cluster.get_cluster_model(cluster_model_path)
|
||||
if self.shallow_diffusion : self.nsf_hifigan_enhance = False
|
||||
if self.nsf_hifigan_enhance:
|
||||
from modules.enhancer import Enhancer
|
||||
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
|
||||
|
||||
|
||||
def load_model(self):
|
||||
# get model configuration
|
||||
self.net_g_ms = SynthesizerTrn(
|
||||
|
@ -157,7 +177,7 @@ class Svc(object):
|
|||
def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
|
||||
|
||||
wav, sr = librosa.load(in_path, sr=self.target_sample)
|
||||
|
||||
|
||||
f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
|
||||
|
||||
f0, uv = f0_predictor_object.compute_f0_uv(wav)
|
||||
|
@ -190,7 +210,8 @@ class Svc(object):
|
|||
f0_filter=False,
|
||||
f0_predictor='pm',
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05
|
||||
cr_threshold = 0.05,
|
||||
k_step = 100
|
||||
):
|
||||
|
||||
speaker_id = self.spk2id.__dict__.get(speaker)
|
||||
|
@ -203,7 +224,44 @@ class Svc(object):
|
|||
c = c.half()
|
||||
with torch.no_grad():
|
||||
start = time.time()
|
||||
audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
|
||||
if not self.only_diffusion:
|
||||
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
|
||||
audio = audio[0,0].data.float()
|
||||
if self.shallow_diffusion:
|
||||
audio_mel = self.vocoder.extract(audio[None,:])
|
||||
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
|
||||
f0 = f0[:,:,None]
|
||||
c = c.transpose(-1,-2)
|
||||
audio_mel = self.diffusion_model(
|
||||
c,
|
||||
f0,
|
||||
vol,
|
||||
spk_id = sid,
|
||||
spk_mix_dict = None,
|
||||
gt_spec=audio_mel,
|
||||
infer=True,
|
||||
infer_speedup=self.diffusion_args.infer.speedup,
|
||||
method=self.diffusion_args.infer.methold,
|
||||
k_step=k_step)
|
||||
audio = self.vocoder.infer(audio_mel, f0).squeeze()
|
||||
else:
|
||||
wav, sr = librosa.load(raw_path, sr=self.target_sample)
|
||||
wav = torch.FloatTensor(wav).to(self.dev)
|
||||
vol = self.volume_extractor.extract(wav[None,:])[None,:,None]
|
||||
c = c.transpose(-1,-2)
|
||||
f0 = f0[:,:,None]
|
||||
audio_mel = self.diffusion_model(
|
||||
c,
|
||||
f0,
|
||||
vol,
|
||||
spk_id = sid,
|
||||
spk_mix_dict = None,
|
||||
gt_spec=None,
|
||||
infer=True,
|
||||
infer_speedup=self.diffusion_args.infer.speedup,
|
||||
method=self.diffusion_args.infer.methold,
|
||||
k_step=k_step)
|
||||
audio = self.vocoder.infer(audio_mel, f0).squeeze()
|
||||
if self.nsf_hifigan_enhance:
|
||||
audio, _ = self.enhancer.enhance(
|
||||
audio[None,:],
|
||||
|
@ -243,9 +301,10 @@ class Svc(object):
|
|||
lgr_num =0.75,
|
||||
f0_predictor='pm',
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05
|
||||
cr_threshold = 0.05,
|
||||
k_step = 100
|
||||
):
|
||||
wav_path = raw_audio_path
|
||||
wav_path = Path(raw_audio_path).with_suffix('.wav')
|
||||
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
||||
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
||||
per_size = int(clip_seconds*audio_sr)
|
||||
|
@ -284,7 +343,8 @@ class Svc(object):
|
|||
noice_scale=noice_scale,
|
||||
f0_predictor = f0_predictor,
|
||||
enhancer_adaptive_key = enhancer_adaptive_key,
|
||||
cr_threshold = cr_threshold
|
||||
cr_threshold = cr_threshold,
|
||||
k_step = k_step
|
||||
)
|
||||
_audio = out_audio.cpu().numpy()
|
||||
pad_len = int(self.target_sample * pad_seconds)
|
||||
|
@ -327,7 +387,7 @@ class RealTimeVC:
|
|||
auto_predict_f0=auto_predict_f0,
|
||||
noice_scale=noice_scale,
|
||||
f0_filter=f0_filter)
|
||||
|
||||
|
||||
audio = audio.cpu().numpy()
|
||||
self.last_chunk = audio[-self.pre_len:]
|
||||
self.last_o = audio
|
||||
|
@ -348,3 +408,4 @@ class RealTimeVC:
|
|||
self.last_chunk = audio[-self.pre_len:]
|
||||
self.last_o = audio
|
||||
return ret[self.chunk_len:2 * self.chunk_len]
|
||||
|
|
@ -29,7 +29,7 @@ def main():
|
|||
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='wav文件名列表,放在raw文件夹下')
|
||||
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
|
||||
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称')
|
||||
|
||||
|
||||
# 可选项部分
|
||||
parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='语音转换自动预测音高,转换歌声时不要打开这个会严重跑调')
|
||||
parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='聚类模型路径,如果没有训练聚类则随便填')
|
||||
|
@ -37,6 +37,13 @@ def main():
|
|||
parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='两段音频切片的交叉淡入长度,如果强制切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,单位为秒')
|
||||
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意:crepe为原F0使用均值滤波器)')
|
||||
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
|
||||
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散,使用后可解决一部分电音问题,默认关闭,该选项打开时,NSF_HIFIGAN增强器将会被禁止')
|
||||
|
||||
# 浅扩散设置
|
||||
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径')
|
||||
parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='扩散模型配置文件路径')
|
||||
parser.add_argument('-ks', '--k_step', type=int, default=100, help='扩散步数,越大越接近扩散模型的结果,默认100')
|
||||
parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='纯扩散模式,该模式不会加载sovits模型,以扩散模型推理')
|
||||
|
||||
# 不用动的部分
|
||||
parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50')
|
||||
|
@ -67,70 +74,40 @@ def main():
|
|||
enhance = args.enhance
|
||||
enhancer_adaptive_key = args.enhancer_adaptive_key
|
||||
cr_threshold = args.f0_filter_threshold
|
||||
diffusion_model_path = args.diffusion_model_path
|
||||
diffusion_config_path = args.diffusion_config_path
|
||||
k_step = args.k_step
|
||||
only_diffusion = args.only_diffusion
|
||||
shallow_diffusion = args.shallow_diffusion
|
||||
|
||||
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
|
||||
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
|
||||
infer_tool.mkdir(["raw", "results"])
|
||||
|
||||
|
||||
infer_tool.fill_a_to_b(trans, clean_names)
|
||||
for clean_name, tran in zip(clean_names, trans):
|
||||
raw_audio_path = f"raw/{clean_name}"
|
||||
if "." not in raw_audio_path:
|
||||
raw_audio_path += ".wav"
|
||||
infer_tool.format_wav(raw_audio_path)
|
||||
wav_path = Path(raw_audio_path).with_suffix('.wav')
|
||||
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
||||
audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
|
||||
per_size = int(clip*audio_sr)
|
||||
lg_size = int(lg*audio_sr)
|
||||
lg_size_r = int(lg_size*lgr)
|
||||
lg_size_c_l = (lg_size-lg_size_r)//2
|
||||
lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
|
||||
lg_2 = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
|
||||
|
||||
for spk in spk_list:
|
||||
audio = []
|
||||
for (slice_tag, data) in audio_data:
|
||||
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
|
||||
|
||||
length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
|
||||
if slice_tag:
|
||||
print('jump empty segment')
|
||||
_audio = np.zeros(length)
|
||||
audio.extend(list(infer_tool.pad_array(_audio, length)))
|
||||
continue
|
||||
if per_size != 0:
|
||||
datas = infer_tool.split_list_by_n(data, per_size,lg_size)
|
||||
else:
|
||||
datas = [data]
|
||||
for k,dat in enumerate(datas):
|
||||
per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
|
||||
if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
|
||||
# padd
|
||||
pad_len = int(audio_sr * pad_seconds)
|
||||
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
|
||||
raw_path = io.BytesIO()
|
||||
soundfile.write(raw_path, dat, audio_sr, format="wav")
|
||||
raw_path.seek(0)
|
||||
out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
|
||||
cluster_infer_ratio=cluster_infer_ratio,
|
||||
auto_predict_f0=auto_predict_f0,
|
||||
noice_scale=noice_scale,
|
||||
f0_predictor = f0p,
|
||||
enhancer_adaptive_key = enhancer_adaptive_key,
|
||||
cr_threshold = cr_threshold
|
||||
)
|
||||
_audio = out_audio.cpu().numpy()
|
||||
pad_len = int(svc_model.target_sample * pad_seconds)
|
||||
_audio = _audio[pad_len:-pad_len]
|
||||
_audio = infer_tool.pad_array(_audio, per_length)
|
||||
if lg_size!=0 and k!=0:
|
||||
lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
|
||||
lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
|
||||
lg_pre = lg1*(1-lg_2)+lg2*lg_2
|
||||
audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
|
||||
audio.extend(lg_pre)
|
||||
_audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
|
||||
audio.extend(list(_audio))
|
||||
kwarg = {
|
||||
"raw_audio_path" : raw_audio_path,
|
||||
"spk" : spk,
|
||||
"tran" : tran,
|
||||
"slice_db" : slice_db,
|
||||
"cluster_infer_ratio" : cluster_infer_ratio,
|
||||
"auto_predict_f0" : auto_predict_f0,
|
||||
"noice_scale" : noice_scale,
|
||||
"pad_seconds" : pad_seconds,
|
||||
"clip_seconds" : clip,
|
||||
"lg_num": lg,
|
||||
"lgr_num" : lgr,
|
||||
"f0_predictor" : f0p,
|
||||
"enhancer_adaptive_key" : enhancer_adaptive_key,
|
||||
"cr_threshold" : cr_threshold,
|
||||
"k_step":k_step
|
||||
}
|
||||
audio = svc_model.slice_inference(**kwarg)
|
||||
key = "auto" if auto_predict_f0 else f"{tran}key"
|
||||
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
|
||||
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
|
||||
|
|
|
@ -417,4 +417,4 @@ class SynthesizerTrn(nn.Module):
|
|||
z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
|
||||
z = self.flow(z_p, c_mask, g=g, reverse=True)
|
||||
o = self.dec(z * c_mask, g=g, f0=f0)
|
||||
return o
|
||||
return o,f0
|
||||
|
|
|
@ -19,3 +19,4 @@ librosa==0.9.1
|
|||
tensorboard
|
||||
tensorboardX
|
||||
edge_tts
|
||||
pyyaml
|
||||
|
|
|
@ -22,3 +22,4 @@ onnxsim
|
|||
onnxoptimizer
|
||||
tensorboardX
|
||||
edge_tts
|
||||
pyyaml
|
||||
|
|
2
utils.py
2
utils.py
|
@ -416,7 +416,7 @@ class Volume_Extractor:
|
|||
self.hop_size = hop_size
|
||||
|
||||
def extract(self, audio): # audio: 2d tensor array
|
||||
if isinstance(audio,torch.Tensor):
|
||||
if not isinstance(audio,torch.Tensor):
|
||||
audio = torch.Tensor(audio)
|
||||
n_frames = int(audio.size(-1) // self.hop_size)
|
||||
audio2 = audio ** 2
|
||||
|
|
Loading…
Reference in New Issue