commit
8bff4720bd
|
@ -190,38 +190,59 @@ class Svc(object):
|
||||||
# 清理显存
|
# 清理显存
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
def slice_inference(self, raw_audio_path, spk, tran, slice_db, cluster_infer_ratio, auto_predict_f0, noice_scale,
                    pad_seconds=0.5, clip_seconds=0, lg_num=0, lgr_num=0.75):
    """Convert an audio file piece by piece and return the joined result.

    The file is cut with ``slicer.cut`` using ``slice_db`` as the threshold.
    Segments flagged by the slicer are replaced with zeros; every other
    segment is sent through ``self.infer`` (optionally split further into
    fixed-size clips) and the outputs are appended in order.

    Args:
        raw_audio_path: Path of the input audio, handed to ``slicer``.
        spk, tran: Forwarded unchanged to ``self.infer``.
        slice_db: Passed to ``slicer.cut`` as ``db_thresh``.
        cluster_infer_ratio, auto_predict_f0, noice_scale: Forwarded
            unchanged to ``self.infer``.
        pad_seconds: Seconds of zeros added to both ends of each piece
            before inference and trimmed from the output afterwards.
        clip_seconds: When non-zero, each segment is split with
            ``split_list_by_n`` into clips of this many seconds.
        lg_num: Cross-fade length in seconds between consecutive clips of
            the same segment; 0 disables cross-fading.
        lgr_num: Fraction of the cross-fade span that is linearly blended.
            The remainder is dropped, split between the end of the previous
            clip and the start of the next one.

    Returns:
        numpy.ndarray holding the concatenated converted audio.
    """
    wav_path = raw_audio_path
    chunks = slicer.cut(wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

    # Sizes below are sample counts at the input sample rate.
    per_size = int(clip_seconds * audio_sr)
    lg_size = int(lg_num * audio_sr)
    lg_size_r = int(lg_size * lgr_num)
    lg_size_c_l = (lg_size - lg_size_r) // 2
    lg_size_c_r = lg_size - lg_size_r - lg_size_c_l
    # Linear fade-in weights for the blended part of the overlap.
    lg = np.linspace(0, 1, lg_size_r) if lg_size != 0 else 0

    # Loop-invariant pad lengths: input side and output side.
    in_pad_len = int(audio_sr * pad_seconds)
    out_pad_len = int(self.target_sample * pad_seconds)

    audio = []
    for (slice_tag, data) in audio_data:
        print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
        length = int(np.ceil(len(data) / audio_sr * self.target_sample))
        if slice_tag:
            print('jump empty segment')
            _audio = np.zeros(length)
            audio.extend(list(pad_array(_audio, length)))
            continue
        if per_size != 0:
            # NOTE(review): presumably yields per_size-sample clips that
            # overlap by lg_size samples -- confirm against split_list_by_n.
            datas = split_list_by_n(data, per_size, lg_size)
        else:
            datas = [data]
        for k, dat in enumerate(datas):
            per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds != 0 else length
            if clip_seconds != 0:
                print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
            # Pad with zeros on both sides before inference.
            dat = np.concatenate([np.zeros([in_pad_len]), dat, np.zeros([in_pad_len])])
            raw_path = io.BytesIO()
            soundfile.write(raw_path, dat, audio_sr, format="wav")
            raw_path.seek(0)
            out_audio, out_sr = self.infer(spk, tran, raw_path,
                                           cluster_infer_ratio=cluster_infer_ratio,
                                           auto_predict_f0=auto_predict_f0,
                                           noice_scale=noice_scale
                                           )
            _audio = out_audio.cpu().numpy()
            # Trim the padding again. An explicit end index is required:
            # with a pad length of 0, ``[pad:-pad]`` is ``[0:0]`` and would
            # discard the whole result.
            _audio = _audio[out_pad_len:len(_audio) - out_pad_len]
            _audio = pad_array(_audio, per_length)
            if lg_size != 0 and k != 0:
                # Blend the tail of what has been produced so far with the
                # head of the new clip, then drop the overlapping samples.
                lg1 = audio[-(lg_size_r + lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
                lg2 = _audio[lg_size_c_l:lg_size_c_l + lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
                lg_pre = lg1 * (1 - lg) + lg2 * lg
                audio = audio[0:-(lg_size_r + lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
                audio.extend(lg_pre)
                _audio = _audio[lg_size_c_l + lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
            audio.extend(list(_audio))
    return np.array(audio)
|
||||||
|
|
||||||
|
|
||||||
class RealTimeVC:
|
class RealTimeVC:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.last_chunk = None
|
self.last_chunk = None
|
||||||
|
|
|
@ -0,0 +1,94 @@
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
|
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
|
||||||
|
import gradio as gr
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import soundfile
|
||||||
|
from inference.infer_tool import Svc
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Raise the threshold of chatty third-party loggers to WARNING.
for _noisy_logger in ('numba', 'markdown_it', 'urllib3', 'matplotlib', 'multipart'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Module-level state; `model` is assigned once a model has been loaded.
model = None
spk = None
|
||||||
|
|
||||||
|
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num):
    """Run voice conversion on an uploaded clip and return a status message and audio.

    Args:
        sid: Speaker, forwarded to ``model.slice_inference``.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
            nothing was uploaded.
        vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale,
        pad_seconds, cl_num, lg_num, lgr_num: Forwarded to
            ``model.slice_inference``.

    Returns:
        ``(message, audio)`` where ``audio`` is
        ``(model.target_sample, samples)`` on success and ``None`` otherwise.
        Exceptions are not propagated; they are reported in ``message``.
    """
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        sampling_rate, audio = input_audio
        if np.issubdtype(audio.dtype, np.integer):
            # Scale integer PCM by the maximum of its dtype.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            # np.iinfo rejects float dtypes, so float input is only cast.
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        try:
            # Write with the rate the samples were recorded at. Labelling
            # them with the model's rate would change pitch and speed
            # whenever the two rates differ.
            soundfile.write(temp_path, audio, sampling_rate, format="wav")
            _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, pad_seconds, cl_num, lg_num, lgr_num)
        finally:
            # Remove the temp file even when inference fails.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        return "Success", (model.target_sample, _audio)
    except Exception as e:
        return "异常信息:"+str(e)+"\n请排障后重试",None
|
||||||
|
|
||||||
|
# Build the Gradio interface: model upload and parsing controls, followed by
# the conversion parameters and the output widgets.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Sovits4.0"):
            gr.Markdown(value="""
Sovits4.0 WebUI
""")

            gr.Markdown(value="""
<font size=3>下面是模型文件选择:</font>
""")
            model_path = gr.File(label="模型文件")
            gr.Markdown(value="""
<font size=3>下面是配置文件选择:</font>
""")
            config_path = gr.File(label="配置文件")
            gr.Markdown(value="""
<font size=3>下面是聚类模型文件选择,没有可以不填:</font>
""")
            cluster_model_path = gr.File(label="聚类模型文件")
            # NOTE(review): the selected value is passed to Svc as `device`
            # unchanged -- confirm that Svc accepts the string "gpu".
            device = gr.Dropdown(label="推理设备,留白则为自动选择cpu和gpu",choices=[None,"gpu","cpu"],value=None)
            gr.Markdown(value="""
<font size=3>全部上传完毕后(全部文件模块显示download),点击模型解析进行解析:</font>
""")
            model_analysis_button = gr.Button(value="模型解析")
            sid = gr.Dropdown(label="音色(说话人)")
            sid_output = gr.Textbox(label="Output Message")
            vc_input3 = gr.Audio(label="上传音频")
            vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
            cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
            auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)", value=False)
            slice_db = gr.Number(label="切片阈值", value=-40)
            noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
            cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒/s", value=0)
            pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
            lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
            lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75,interactive=True)
            vc_submit = gr.Button("转换", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")

            def modelAnalysis(model_path, config_path, cluster_model_path, device):
                """Load the uploaded model and fill the speaker dropdown.

                Stores the new ``Svc`` instance in the module-level ``model``.
                Returns ``(dropdown_update, message)``. On any failure the
                first element is an empty string and the message carries the
                exception text.
                """
                global model
                try:
                    model = Svc(
                        model_path.name,
                        config_path.name,
                        device=device if device != "" else None,
                        # The cluster model is optional; an empty path is
                        # passed when none was uploaded.
                        cluster_model_path=cluster_model_path.name if cluster_model_path is not None else "",
                    )
                    spks = list(model.spk2id.keys())
                    return sid.update(choices = spks,value=spks[0]),"ok"
                except Exception as e:
                    return "","异常信息:"+str(e)+"\n请排障后重试"

            # Wire the buttons to their handlers.
            vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num], [vc_output1, vc_output2])
            model_analysis_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device],[sid,sid_output])

app.launch()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue