From 87bc1f8782d3c35228bd1ef5cfeefd1484ea1f41 Mon Sep 17 00:00:00 2001
From: zwa73 <ssfen2419@126.com>
Date: Fri, 31 Mar 2023 01:29:02 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E6=A0=B9=E6=8D=AE@ChrisPreston=E7=9A=84?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B7=BB=E5=8A=A0=E4=BA=86=E5=99=AA=E9=9F=B3?=
 =?UTF-8?q?=E8=BF=87=E6=BB=A4f0=5Ffilter=E5=8F=82=E6=95=B0=20=E5=9C=A8flas?=
 =?UTF-8?q?k=5Fapi=E4=B8=AD=E6=B7=BB=E5=8A=A0=E4=BA=86=E4=B8=80=E4=BA=9B?=
 =?UTF-8?q?=E5=8F=82=E6=95=B0=E4=BC=A0=E9=80=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_api.py            | 10 +++++++---
 inference/infer_tool.py | 44 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/flask_api.py b/flask_api.py
index 8cc236a..5be4141 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -30,10 +30,13 @@ def voice_change_model():
 
     # 模型推理
     if raw_infer:
-        out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+        # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+        out_audio, _ = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
+                                       auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
         tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
     else:
-        out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path)
+        out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
+                                auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
         tar_audio = torchaudio.functional.resample(torch.from_numpy(out_audio), svc_model.target_sample, daw_sample)
     # 返回音频
     out_wav_path = io.BytesIO()
@@ -50,7 +53,8 @@ if __name__ == '__main__':
     # 每个模型和config是唯一对应的
     model_name = "logs/32k/G_174000-Copy1.pth"
     config_name = "configs/config.json"
-    svc_model = Svc(model_name, config_name)
+    cluster_model_path = "logs/44k/kmeans_10000.pt"
+    svc_model = Svc(model_name, config_name, cluster_model_path=cluster_model_path)
     svc = RealTimeVC()
     # 此处与vst插件对应，不建议更改
     app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 3a2635b..b60c00f 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -104,6 +104,9 @@ def pad_array(arr, target_length):
         return padded_arr
 
 
+class F0FilterException(Exception):
+    pass
+
 class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
@@ -138,11 +141,15 @@ class Svc(object):
 
 
 
-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker):
+    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter):
 
         wav, sr = librosa.load(in_path, sr=self.target_sample)
 
         f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
+
+        if f0_filter and sum(f0) == 0:
+            raise F0FilterException("未检测到人声")
+
         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
         uv = torch.FloatTensor(uv)
@@ -166,13 +173,14 @@ class Svc(object):
     def infer(self, speaker, tran, raw_path,
               cluster_infer_ratio=0,
               auto_predict_f0=False,
-              noice_scale=0.4):
+              noice_scale=0.4,
+              f0_filter=False):
         speaker_id = self.spk2id.__dict__.get(speaker)
         if not speaker_id and type(speaker) is int:
             if len(self.spk2id.__dict__) >= speaker:
                 speaker_id = speaker
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker)
+        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
@@ -181,7 +189,7 @@ class Svc(object):
             use_time = time.time() - start
             print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1]
-    
+
     def clear_empty(self):
         # 清理显存
         torch.cuda.empty_cache()
@@ -227,14 +235,30 @@ class RealTimeVC:
 
     """输入输出都是1维numpy 音频波形数组"""
 
-    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
+    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
+                cluster_infer_ratio=0,
+                auto_predict_f0=False,
+                noice_scale=0.4,
+                f0_filter=False):
+
+        cluster_infer_ratio = cluster_infer_ratio
+        auto_predict_f0 = auto_predict_f0
+        noice_scale = noice_scale
+        f0_filter = f0_filter
+
         import maad
         audio, sr = torchaudio.load(input_wav_path)
         audio = audio.cpu().numpy()[0]
         temp_wav = io.BytesIO()
         if self.last_chunk is None:
             input_wav_path.seek(0)
-            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+
+            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
+                                        cluster_infer_ratio=cluster_infer_ratio,
+                                        auto_predict_f0=auto_predict_f0,
+                                        noice_scale=noice_scale,
+                                        f0_filter=f0_filter)
+
             audio = audio.cpu().numpy()
             self.last_chunk = audio[-self.pre_len:]
             self.last_o = audio
@@ -243,7 +267,13 @@ class RealTimeVC:
             audio = np.concatenate([self.last_chunk, audio])
             soundfile.write(temp_wav, audio, sr, format="wav")
             temp_wav.seek(0)
-            audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
+
+            audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
+                                        cluster_infer_ratio=cluster_infer_ratio,
+                                        auto_predict_f0=auto_predict_f0,
+                                        noice_scale=noice_scale,
+                                        f0_filter=f0_filter)
+
             audio = audio.cpu().numpy()
             ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
             self.last_chunk = audio[-self.pre_len:]

From 061d3b46a5b16673613d54416924d3ac1ba1d81f Mon Sep 17 00:00:00 2001
From: zwa73 <ssfen2419@126.com>
Date: Fri, 31 Mar 2023 01:34:10 +0800
Subject: [PATCH 2/4] infer_tool: remove no-op self-assignments in RealTimeVC.process

---
 inference/infer_tool.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 2da5c58..0ea8397 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -179,6 +179,7 @@ class Svc(object):
               auto_predict_f0=False,
               noice_scale=0.4,
               f0_filter=False):
+
         speaker_id = self.spk2id.__dict__.get(speaker)
         if not speaker_id and type(speaker) is int:
             if len(self.spk2id.__dict__) >= speaker:
@@ -266,11 +267,6 @@ class RealTimeVC:
                 noice_scale=0.4,
                 f0_filter=False):
 
-        cluster_infer_ratio = cluster_infer_ratio
-        auto_predict_f0 = auto_predict_f0
-        noice_scale = noice_scale
-        f0_filter = f0_filter
-
         import maad
         audio, sr = torchaudio.load(input_wav_path)
         audio = audio.cpu().numpy()[0]

From 6a0000823299314204861aeb9da0a63fa4188791 Mon Sep 17 00:00:00 2001
From: zwa73 <ssfen2419@126.com>
Date: Fri, 31 Mar 2023 01:35:50 +0800
Subject: [PATCH 3/4] flask_api: name second infer() return value out_sr instead of _

---
 flask_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flask_api.py b/flask_api.py
index 5be4141..5e5aebe 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -31,7 +31,7 @@ def voice_change_model():
     # 模型推理
     if raw_infer:
         # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
-        out_audio, _ = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
+        out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
                                        auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
         tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
     else:

From 41dd39790ff211b1bfbbf34ea168fb0f5301dea4 Mon Sep 17 00:00:00 2001
From: zwa73 <ssfen2419@126.com>
Date: Fri, 31 Mar 2023 01:37:23 +0800
Subject: [PATCH 4/4] flask_api: realign continuation line of svc_model.infer call

---
 flask_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flask_api.py b/flask_api.py
index 5e5aebe..b3f1e06 100644
--- a/flask_api.py
+++ b/flask_api.py
@@ -32,7 +32,7 @@ def voice_change_model():
     if raw_infer:
         # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
         out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
-                                       auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
+                                            auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
         tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
     else:
         out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,