diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 6440fb0..dd1799a 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -190,38 +190,92 @@ class Svc(object):
         # 清理显存
         torch.cuda.empty_cache()
 
-    def slice_inference(self,raw_audio_path, spk, tran, slice_db,cluster_infer_ratio, auto_predict_f0,noice_scale, pad_seconds=0.5):
+    def slice_inference(self,raw_audio_path, spk, tran, slice_db,cluster_infer_ratio, auto_predict_f0,noice_scale, pad_seconds=0.5, clip_seconds=0,lg_num=0,lgr_num =0.75):
+        """Convert an audio file slice by slice and return the joined audio.
+
+        The file is cut with ``slicer.cut``; slices flagged by ``slice_tag`` are
+        emitted as zeros, the others go through ``self.infer``. A non-zero
+        ``clip_seconds`` further splits each slice into fixed-length clips, and
+        a non-zero ``lg_num`` cross-fades consecutive clips of the same slice.
+
+        Args:
+            raw_audio_path: path of the audio file to convert.
+            spk, tran, cluster_infer_ratio, auto_predict_f0, noice_scale:
+                forwarded unchanged to ``self.infer``.
+            slice_db: passed to ``slicer.cut`` as ``db_thresh``.
+            pad_seconds: seconds of silence added around every clip before
+                inference and trimmed from the converted clip afterwards.
+            clip_seconds: clip length in seconds; 0 disables clipping.
+            lg_num: cross-fade length in seconds; 0 disables cross-fading.
+            lgr_num: fraction of the cross-fade region that is blended; the
+                remainder is split into margins discarded on both sides.
+
+        Returns:
+            A ``numpy`` array with the concatenated converted samples.
+        """
         wav_path = raw_audio_path
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-
+        per_size = int(clip_seconds*audio_sr)
+        # Passed to split_list_by_n (presumably as the clip overlap), in input samples.
+        lg_size = int(lg_num*audio_sr)
+        # Blending happens on converted audio, so size the window in output
+        # samples; this equals lg_size whenever both sample rates match.
+        lg_size_out = int(lg_num*self.target_sample)
+        lg_size_r = int(lg_size_out*lgr_num)
+        lg_size_c_l = (lg_size_out-lg_size_r)//2
+        lg_size_c_r = lg_size_out-lg_size_r-lg_size_c_l
+        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
+
         audio = []
         for (slice_tag, data) in audio_data:
             print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
             # padd
-            pad_len = int(audio_sr * pad_seconds)
-            data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
             length = int(np.ceil(len(data) / audio_sr * self.target_sample))
-            raw_path = io.BytesIO()
-            soundfile.write(raw_path, data, audio_sr, format="wav")
-            raw_path.seek(0)
             if slice_tag:
                 print('jump empty segment')
                 _audio = np.zeros(length)
+                audio.extend(list(pad_array(_audio, length)))
+                continue
+            if per_size != 0:
+                datas = split_list_by_n(data, per_size,lg_size)
             else:
+                datas = [data]
+            for k,dat in enumerate(datas):
+                per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
+                if clip_seconds!=0:
+                    print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+                # Pad the clip with silence on both sides before inference.
+                pad_len = int(audio_sr * pad_seconds)
+                dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
+                raw_path = io.BytesIO()
+                soundfile.write(raw_path, dat, audio_sr, format="wav")
+                raw_path.seek(0)
                 out_audio, out_sr = self.infer(spk, tran, raw_path,
                                                     cluster_infer_ratio=cluster_infer_ratio,
                                                     auto_predict_f0=auto_predict_f0,
                                                     noice_scale=noice_scale
                                                     )
                 _audio = out_audio.cpu().numpy()
-
-            pad_len = int(self.target_sample * pad_seconds)
-            _audio = _audio[pad_len:-pad_len]
-            audio.extend(list(_audio))
+                pad_len = int(self.target_sample * pad_seconds)
+                # With no padding `[0:-0]` would be an empty slice, so skip the trim.
+                if pad_len > 0:
+                    _audio = _audio[pad_len:-pad_len]
+                _audio = pad_array(_audio, per_length)
+                if lg_size!=0 and k!=0:
+                    # Index from the front: a zero-width right margin would turn
+                    # a negative end index into `-0` and empty the slice.
+                    fade_end = max(len(audio)-lg_size_c_r, 0)
+                    fade_start = max(fade_end-lg_size_r, 0)
+                    lg1 = np.asarray(audio[fade_start:fade_end])
+                    lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r]
+                    lg_pre = lg1*(1-lg)+lg2*lg
+                    audio = audio[0:fade_start]
+                    audio.extend(lg_pre)
+                    _audio = _audio[lg_size_c_l+lg_size_r:]
+                audio.extend(list(_audio))
         return np.array(audio)
 
-
 class RealTimeVC:
     def __init__(self):
         self.last_chunk = None
diff --git a/webUI.py b/webUI.py
new file mode 100644
index 0000000..1795982
--- /dev/null
+++ b/webUI.py
@@ -0,0 +1,118 @@
+import io
+import os
+
+# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
+import gradio as gr
+import librosa
+import numpy as np
+import soundfile
+from inference.infer_tool import Svc
+import logging
+
+logging.getLogger('numba').setLevel(logging.WARNING)
+logging.getLogger('markdown_it').setLevel(logging.WARNING)
+logging.getLogger('urllib3').setLevel(logging.WARNING)
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+logging.getLogger('multipart').setLevel(logging.WARNING)
+
+model = None
+spk = None
+
+def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num):
+    """Run the loaded model on the uploaded audio for the "转换" button.
+
+    ``input_audio`` is the ``(sampling_rate, samples)`` pair from the audio
+    widget; the remaining arguments are forwarded to ``Svc.slice_inference``.
+    Returns ``(message, audio)`` where ``audio`` is ``(sample_rate, samples)``
+    on success and ``None`` otherwise.
+    """
+    global model
+    try:
+        if input_audio is None:
+            return "You need to upload an audio", None
+        if model is None:
+            return "You need to upload an model", None
+        sampling_rate, audio = input_audio
+        # Integer PCM is scaled by its dtype maximum; np.iinfo rejects float
+        # dtypes, so float input is only cast.
+        if np.issubdtype(audio.dtype, np.integer):
+            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        else:
+            audio = audio.astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        temp_path = "temp.wav"
+        try:
+            # Tag the file with the rate the samples were recorded at; using
+            # the model's rate here would change speed and pitch of the input.
+            soundfile.write(temp_path, audio, sampling_rate, format="wav")
+            _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num)
+        finally:
+            # Remove the scratch file even when writing or inference fails.
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
+        return "Success", (model.target_sample, _audio)
+    except Exception as e:
+        return "异常信息:"+str(e)+"\n请排障后重试",None
+
+app = gr.Blocks()
+with app:
+    with gr.Tabs():
+        with gr.TabItem("Sovits4.0"):
+            gr.Markdown(value="""
+                Sovits4.0 WebUI
+                """)
+
+            gr.Markdown(value="""
+                下面是模型文件选择:
+                """)
+            model_path = gr.File(label="模型文件")
+            gr.Markdown(value="""
+                下面是配置文件选择:
+                """)
+            config_path = gr.File(label="配置文件")
+            gr.Markdown(value="""
+                下面是聚类模型文件选择,没有可以不填:
+                """)
+            cluster_model_path = gr.File(label="聚类模型文件")
+            device = gr.Dropdown(label="推理设备,留白则为自动选择cpu和gpu",choices=[None,"gpu","cpu"],value=None)
+            gr.Markdown(value="""
+                全部上传完毕后(全部文件模块显示download),点击模型解析进行解析:
+                """)
+            model_analysis_button = gr.Button(value="模型解析")
+            sid = gr.Dropdown(label="音色(说话人)")
+            sid_output = gr.Textbox(label="Output Message")
+            vc_input3 = gr.Audio(label="上传音频")
+            vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
+            cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
+            auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)", value=False)
+            slice_db = gr.Number(label="切片阈值", value=-40)
+            noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
+            cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒/s", value=0)
+            pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
+            lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
+            lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75,interactive=True)
+            vc_submit = gr.Button("转换", variant="primary")
+            vc_output1 = gr.Textbox(label="Output Message")
+            vc_output2 = gr.Audio(label="Output Audio")
+
+    def modelAnalysis(model_path,config_path,cluster_model_path,device):
+        """Load the uploaded model files into the global ``model``.
+
+        Returns an update for the speaker dropdown plus a status message; on
+        failure the dropdown value is "" and the message carries the error.
+        """
+        global model
+        try:
+            # NOTE(review): the dropdown offers "gpu"; confirm Svc accepts that
+            # device name (torch itself spells it "cuda").
+            model = Svc(model_path.name, config_path.name,device=device if device!="" else None,cluster_model_path= cluster_model_path.name if cluster_model_path is not None else "")
+            spks = list(model.spk2id.keys())
+            return sid.update(choices = spks,value=spks[0]),"ok"
+        except Exception as e:
+            return "","异常信息:"+str(e)+"\n请排障后重试"
+
+    vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num], [vc_output1, vc_output2])
+    model_analysis_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device],[sid,sid_output])
+
+app.launch()