Update threshold parameter
This commit is contained in:
parent
105dd3f798
commit
b64b2328ab
|
@ -129,7 +129,7 @@ class Svc(object):
|
|||
self.hop_size = self.hps_ms.data.hop_length
|
||||
self.spk2id = self.hps_ms.spk
|
||||
self.nsf_hifigan_enhance = nsf_hifigan_enhance
|
||||
# 加载hubert
|
||||
# load hubert
|
||||
self.hubert_model = utils.get_hubert_model().to(self.dev)
|
||||
self.load_model()
|
||||
if os.path.exists(cluster_model_path):
|
||||
|
@ -139,7 +139,7 @@ class Svc(object):
|
|||
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
|
||||
|
||||
def load_model(self):
|
||||
# 获取模型配置
|
||||
# get model configuration
|
||||
self.net_g_ms = SynthesizerTrn(
|
||||
self.hps_ms.data.filter_length // 2 + 1,
|
||||
self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
|
||||
|
@ -152,20 +152,20 @@ class Svc(object):
|
|||
|
||||
|
||||
|
||||
def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
|
||||
def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling,cr_threshold=0.05):
|
||||
|
||||
wav, sr = librosa.load(in_path, sr=self.target_sample)
|
||||
|
||||
if F0_mean_pooling == True:
|
||||
f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev)
|
||||
f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev,cr_threshold = cr_threshold)
|
||||
if f0_filter and sum(f0) == 0:
|
||||
raise F0FilterException("未检测到人声")
|
||||
raise F0FilterException("No voice detected")
|
||||
f0 = torch.FloatTensor(list(f0))
|
||||
uv = torch.FloatTensor(list(uv))
|
||||
if F0_mean_pooling == False:
|
||||
f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
|
||||
if f0_filter and sum(f0) == 0:
|
||||
raise F0FilterException("未检测到人声")
|
||||
raise F0FilterException("No voice detected")
|
||||
f0, uv = utils.interpolate_f0(f0)
|
||||
f0 = torch.FloatTensor(f0)
|
||||
uv = torch.FloatTensor(uv)
|
||||
|
@ -193,7 +193,8 @@ class Svc(object):
|
|||
noice_scale=0.4,
|
||||
f0_filter=False,
|
||||
F0_mean_pooling=False,
|
||||
enhancer_adaptive_key = 0
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05
|
||||
):
|
||||
|
||||
speaker_id = self.spk2id.__dict__.get(speaker)
|
||||
|
@ -201,7 +202,7 @@ class Svc(object):
|
|||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
|
||||
c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
|
||||
c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling,cr_threshold=cr_threshold)
|
||||
if "half" in self.net_g_path and torch.cuda.is_available():
|
||||
c = c.half()
|
||||
with torch.no_grad():
|
||||
|
@ -219,11 +220,11 @@ class Svc(object):
|
|||
return audio, audio.shape[-1]
|
||||
|
||||
def clear_empty(self):
|
||||
# 清理显存
|
||||
# clean up vram
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def unload_model(self):
|
||||
# 卸载模型
|
||||
# unload model
|
||||
self.net_g_ms = self.net_g_ms.to("cpu")
|
||||
del self.net_g_ms
|
||||
if hasattr(self,"enhancer"):
|
||||
|
@ -245,7 +246,8 @@ class Svc(object):
|
|||
lg_num=0,
|
||||
lgr_num =0.75,
|
||||
F0_mean_pooling = False,
|
||||
enhancer_adaptive_key = 0
|
||||
enhancer_adaptive_key = 0,
|
||||
cr_threshold = 0.05
|
||||
):
|
||||
wav_path = raw_audio_path
|
||||
chunks = slicer.cut(wav_path, db_thresh=slice_db)
|
||||
|
@ -285,7 +287,8 @@ class Svc(object):
|
|||
auto_predict_f0=auto_predict_f0,
|
||||
noice_scale=noice_scale,
|
||||
F0_mean_pooling = F0_mean_pooling,
|
||||
enhancer_adaptive_key = enhancer_adaptive_key
|
||||
enhancer_adaptive_key = enhancer_adaptive_key,
|
||||
cr_threshold = cr_threshold
|
||||
)
|
||||
_audio = out_audio.cpu().numpy()
|
||||
pad_len = int(self.target_sample * pad_seconds)
|
||||
|
@ -305,10 +308,10 @@ class RealTimeVC:
|
|||
def __init__(self):
|
||||
self.last_chunk = None
|
||||
self.last_o = None
|
||||
self.chunk_len = 16000 # 区块长度
|
||||
self.pre_len = 3840 # 交叉淡化长度,640的倍数
|
||||
self.chunk_len = 16000 # chunk length
|
||||
self.pre_len = 3840 # cross fade length, multiples of 640
|
||||
|
||||
"""输入输出都是1维numpy 音频波形数组"""
|
||||
# Input and output are 1-dimensional numpy waveform arrays
|
||||
|
||||
def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
|
||||
cluster_infer_ratio=0,
|
||||
|
|
|
@ -46,7 +46,9 @@ def main():
|
|||
parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音频输出格式')
|
||||
parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭')
|
||||
parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='使增强器适应更高的音域(单位为半音数)|默认为0')
|
||||
|
||||
parser.add_argument('-ft', '--F0_filter_threshold', type=float, default=0.05,help='F0过滤阈值,只有启动f0_mean_pooling时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音')
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
clean_names = args.clean_names
|
||||
|
@ -64,6 +66,7 @@ def main():
|
|||
F0_mean_pooling = args.f0_mean_pooling
|
||||
enhance = args.enhance
|
||||
enhancer_adaptive_key = args.enhancer_adaptive_key
|
||||
cr_threshold = args.F0_filter_threshold
|
||||
|
||||
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
|
||||
infer_tool.mkdir(["raw", "results"])
|
||||
|
@ -113,7 +116,8 @@ def main():
|
|||
auto_predict_f0=auto_predict_f0,
|
||||
noice_scale=noice_scale,
|
||||
F0_mean_pooling = F0_mean_pooling,
|
||||
enhancer_adaptive_key = enhancer_adaptive_key
|
||||
enhancer_adaptive_key = enhancer_adaptive_key,
|
||||
cr_threshold = cr_threshold
|
||||
)
|
||||
_audio = out_audio.cpu().numpy()
|
||||
pad_len = int(svc_model.target_sample * pad_seconds)
|
||||
|
|
4
utils.py
4
utils.py
|
@ -80,7 +80,7 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
|
|||
exit(0)
|
||||
return f0_norm * x_mask
|
||||
|
||||
def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None):
|
||||
def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None,cr_threshold=0.05):
|
||||
from modules.crepe import CrepePitchExtractor
|
||||
x = wav_numpy
|
||||
if p_len is None:
|
||||
|
@ -90,7 +90,7 @@ def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_len
|
|||
|
||||
f0_min = 50
|
||||
f0_max = 1100
|
||||
F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device)
|
||||
F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=cr_threshold)
|
||||
f0,uv = F0Creper(x[None,:].float(),sampling_rate,pad_to=p_len)
|
||||
return f0,uv
|
||||
|
||||
|
|
15
webUI.py
15
webUI.py
|
@ -106,7 +106,7 @@ def modelUnload():
|
|||
return sid.update(choices = [],value=""),"模型卸载完毕!"
|
||||
|
||||
|
||||
def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key):
|
||||
def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
|
||||
global model
|
||||
try:
|
||||
if input_audio is None:
|
||||
|
@ -120,7 +120,7 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
|
|||
audio = librosa.to_mono(audio.transpose(1, 0))
|
||||
temp_path = "temp.wav"
|
||||
soundfile.write(temp_path, audio, sampling_rate, format="wav")
|
||||
_audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
|
||||
_audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
|
||||
model.clear_empty()
|
||||
os.remove(temp_path)
|
||||
#构建保存文件的路径,并保存到results文件夹内
|
||||
|
@ -166,7 +166,7 @@ def tts_func(_text,_rate,_voice):
|
|||
def text_clear(text):
|
||||
return re.sub(r"[\n\,\(\) ]", "", text)
|
||||
|
||||
def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key):
|
||||
def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
|
||||
#使用edge-tts把文字转成音频
|
||||
text2tts=text_clear(text2tts)
|
||||
output_file=tts_func(text2tts,tts_rate,tts_voice)
|
||||
|
@ -184,7 +184,7 @@ def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, nois
|
|||
sample_rate, data=gr_pu.audio_from_file(save_path2)
|
||||
vc_input=(sample_rate, data)
|
||||
|
||||
a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
|
||||
a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
|
||||
os.remove(output_file)
|
||||
os.remove(save_path2)
|
||||
return a,b
|
||||
|
@ -242,6 +242,7 @@ with gr.Blocks(
|
|||
lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
|
||||
lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
|
||||
enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
|
||||
cr_threshold = gr.Number(label="F0过滤阈值,只有启动f0_mean_pooling时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
|
||||
with gr.Tabs():
|
||||
with gr.TabItem("音频转音频"):
|
||||
vc_input3 = gr.Audio(label="选择音频")
|
||||
|
@ -299,12 +300,12 @@ with gr.Blocks(
|
|||
<font size=2> WebUI设置</font>
|
||||
""")
|
||||
debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
|
||||
vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
|
||||
vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
|
||||
vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
|
||||
vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
|
||||
debug_button.change(debug_change,[],[])
|
||||
model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output])
|
||||
model_unload_button.click(modelUnload,[],[sid,sid_output])
|
||||
app.launch()
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue