Update threshold parameter

2023-04-17 18:21:00 +08:00 · 2023-04-17 18:21:00 +08:00 · caf873cf43
parent f1aa45a3b9
commit caf873cf43
4 changed files with 24 additions and 15 deletions
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@ -152,12 +152,12 @@ class Svc(object):



-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
+    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling,cr_threshold=0.05):

        wav, sr = librosa.load(in_path, sr=self.target_sample)

        if F0_mean_pooling == True:
-            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev)
+            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev,cr_threshold = cr_threshold)
            if f0_filter and sum(f0) == 0:
                raise F0FilterException("No voice detected")
            f0 = torch.FloatTensor(list(f0))
@ -193,7 +193,8 @@ class Svc(object):
              noice_scale=0.4,
              f0_filter=False,
              F0_mean_pooling=False,
-              enhancer_adaptive_key = 0
+              enhancer_adaptive_key = 0,
+              cr_threshold = 0.05
              ):

        speaker_id = self.spk2id.__dict__.get(speaker)
@ -201,7 +202,7 @@ class Svc(object):
            if len(self.spk2id.__dict__) >= speaker:
                speaker_id = speaker
        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
+        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling,cr_threshold=cr_threshold)
        if "half" in self.net_g_path and torch.cuda.is_available():
            c = c.half()
        with torch.no_grad():
@ -245,7 +246,8 @@ class Svc(object):
                        lg_num=0,
                        lgr_num =0.75,
                        F0_mean_pooling = False,
-                        enhancer_adaptive_key = 0
+                        enhancer_adaptive_key = 0,
+                        cr_threshold = 0.05
                        ):
        wav_path = raw_audio_path
        chunks = slicer.cut(wav_path, db_thresh=slice_db)
@ -285,7 +287,8 @@ class Svc(object):
                                                    auto_predict_f0=auto_predict_f0,
                                                    noice_scale=noice_scale,
                                                    F0_mean_pooling = F0_mean_pooling,
-                                                    enhancer_adaptive_key = enhancer_adaptive_key
+                                                    enhancer_adaptive_key = enhancer_adaptive_key,
+                                                    cr_threshold = cr_threshold
                                                    )
                _audio = out_audio.cpu().numpy()
                pad_len = int(self.target_sample * pad_seconds)
--- a/inference_main.py
+++ b/inference_main.py
@ -65,6 +65,9 @@ def main():
                        help='Proportion of cross length retention, range (0-1]. After forced slicing, the beginning and end of each segment need to be discarded.')
    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0,
                        help='Adapt the enhancer to a higher range of sound. The unit is the semitones, default 0.')
+    parser.add_argument('-ft', '--F0_filter_threshold', type=float, default=0.05,
+                        help='F0 Filtering threshold: This parameter is valid only when f0_mean_pooling is enabled. Values range from 0 to 1. Reducing this value reduces the probability of being out of tune, but increases matte.')
+

    args = parser.parse_args()

@ -83,6 +86,7 @@ def main():
    F0_mean_pooling = args.f0_mean_pooling
    enhance = args.enhance
    enhancer_adaptive_key = args.enhancer_adaptive_key
+    cr_threshold = args.F0_filter_threshold

    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
    infer_tool.mkdir(["raw", "results"])
@ -132,7 +136,8 @@ def main():
                                                        auto_predict_f0=auto_predict_f0,
                                                        noice_scale=noice_scale,
                                                        F0_mean_pooling = F0_mean_pooling,
-                                                        enhancer_adaptive_key = enhancer_adaptive_key
+                                                        enhancer_adaptive_key = enhancer_adaptive_key,
+                                                        cr_threshold = cr_threshold
                                                        )
                    _audio = out_audio.cpu().numpy()
                    pad_len = int(svc_model.target_sample * pad_seconds)
--- a/utils.py
+++ b/utils.py
@ -80,7 +80,7 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
        exit(0)
    return f0_norm * x_mask

-def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None):
+def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None,cr_threshold=0.05):
    from modules.crepe import CrepePitchExtractor
    x = wav_numpy
    if p_len is None:
@ -90,7 +90,7 @@ def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_len
    
    f0_min = 50
    f0_max = 1100
-    F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device)
+    F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=cr_threshold)
    f0,uv = F0Creper(x[None,:].float(),sampling_rate,pad_to=p_len)
    return f0,uv

--- a/webUI.py
+++ b/webUI.py
@ -106,7 +106,7 @@ def modelUnload():
        return sid.update(choices = [],value=""),"模型卸载完毕!"


-def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key):
+def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
    global model
    try:
        if input_audio is None:
@ -120,7 +120,7 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
-        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
+        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
        model.clear_empty()
        os.remove(temp_path)
        #构建保存文件的路径，并保存到results文件夹内
@ -166,7 +166,7 @@ def tts_func(_text,_rate,_voice):
 def text_clear(text):
    return re.sub(r"[\n\,\(\) ]", "", text)

-def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key):
+def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
    #使用edge-tts把文字转成音频
    text2tts=text_clear(text2tts)
    output_file=tts_func(text2tts,tts_rate,tts_voice)
@ -184,7 +184,7 @@ def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, nois
    sample_rate, data=gr_pu.audio_from_file(save_path2)
    vc_input=(sample_rate, data)

-    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
+    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
    os.remove(output_file)
    os.remove(save_path2)
    return a,b
@ -242,6 +242,7 @@ with gr.Blocks(
                    lg_num = gr.Number(label="两端音频切片的交叉淡入长度，如果自动切片后出现人声不连贯可调整该数值，如果连贯建议采用默认值0，注意，该设置会影响推理速度，单位为秒/s", value=0)
                    lgr_num = gr.Number(label="自动音频切片后，需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例，范围0-1,左开右闭", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
+                    cr_threshold = gr.Number(label="F0过滤阈值，只有启动f0_mean_pooling时有效. 数值范围从0-1. 降低该值可减少跑调概率，但会增加哑音", value=0.05)
            with gr.Tabs():
                with gr.TabItem("音频转音频"):
                    vc_input3 = gr.Audio(label="选择音频")
@ -299,8 +300,8 @@ with gr.Blocks(
                    <font size=2> WebUI设置</font>
                    """)
                debug_button = gr.Checkbox(label="Debug模式，如果向社区反馈BUG需要打开，打开后控制台可以显示具体错误提示", value=debug)
-        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
-        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
+        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
+        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
        debug_button.change(debug_change,[],[])
        model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output])
        model_unload_button.click(modelUnload,[],[sid,sid_output])