From 9fa9490e53c7909cdf124419f0f6547eeda38617 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Sat, 13 May 2023 15:33:40 +0800
Subject: [PATCH] Update F0 Predictor

---
 inference/infer_tool.py                   |  47 ++++----
 inference_main.py                         |   8 +-
 modules/F0Predictor/CrepeF0Predictor.py   |  31 +++++
 modules/F0Predictor/DioF0Predictor.py     |  59 ++++++++++
 modules/F0Predictor/F0Predictor.py        |  16 +++
 modules/F0Predictor/HarvestF0Predictor.py |  55 +++++++++
 modules/F0Predictor/PMF0Predictor.py      |  83 ++++++++++++++
 modules/F0Predictor/__init__.py           |   0
 modules/{ => F0Predictor}/crepe.py        |  11 +-
 preprocess_hubert_f0.py                   |   6 +-
 utils.py                                  | 131 ----------------------
 webUI.py                                  |  16 +--
 12 files changed, 297 insertions(+), 166 deletions(-)
 create mode 100644 modules/F0Predictor/CrepeF0Predictor.py
 create mode 100644 modules/F0Predictor/DioF0Predictor.py
 create mode 100644 modules/F0Predictor/F0Predictor.py
 create mode 100644 modules/F0Predictor/HarvestF0Predictor.py
 create mode 100644 modules/F0Predictor/PMF0Predictor.py
 create mode 100644 modules/F0Predictor/__init__.py
 rename modules/{ => F0Predictor}/crepe.py (96%)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 91561cf..960ebd5 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -152,27 +152,34 @@ class Svc(object):
-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling,cr_threshold=0.05):
+    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
 
         wav, sr = librosa.load(in_path, sr=self.target_sample)
 
-        if F0_mean_pooling == True:
-            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev,cr_threshold = cr_threshold)
-            if f0_filter and sum(f0) == 0:
-                raise F0FilterException("No voice detected")
-            f0 = torch.FloatTensor(list(f0))
-            uv = torch.FloatTensor(list(uv))
-        if F0_mean_pooling == False:
-            f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
-            if f0_filter and sum(f0) == 0:
-                raise F0FilterException("No voice detected")
-            f0, uv = utils.interpolate_f0(f0)
-            f0 = torch.FloatTensor(f0)
-            uv = torch.FloatTensor(uv)
+        if f0_predictor == "pm":
+            from modules.F0Predictor.PMF0Predictor import PMF0Predictor
+            f0_predictor_object = PMF0Predictor(hop_length=self.hop_size,sampling_rate=self.target_sample)
+        elif f0_predictor == "crepe":
+            from modules.F0Predictor.CrepeF0Predictor import CrepeF0Predictor
+            f0_predictor_object = CrepeF0Predictor(hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
+        elif f0_predictor == "harvest":
+            from modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
+            f0_predictor_object = HarvestF0Predictor(hop_length=self.hop_size,sampling_rate=self.target_sample)
+        elif f0_predictor == "dio":
+            from modules.F0Predictor.DioF0Predictor import DioF0Predictor
+            f0_predictor_object = DioF0Predictor(hop_length=self.hop_size,sampling_rate=self.target_sample)
+        else:
+            raise Exception("Unknown f0 predictor")
+
+        f0, uv = f0_predictor_object.compute_f0_uv(wav)
+        if f0_filter and sum(f0) == 0:
+            raise F0FilterException("No voice detected")
+        f0 = torch.FloatTensor(f0).to(self.dev)
+        uv = torch.FloatTensor(uv).to(self.dev)
+
         f0 = f0 * 2 ** (tran / 12)
-        f0 = f0.unsqueeze(0).to(self.dev)
-        uv = uv.unsqueeze(0).to(self.dev)
+        f0 = f0.unsqueeze(0)
+        uv = uv.unsqueeze(0)
 
         wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
         wav16k = torch.from_numpy(wav16k).to(self.dev)
@@ -192,7 +199,7 @@ class Svc(object):
               auto_predict_f0=False,
               noice_scale=0.4,
               f0_filter=False,
-              F0_mean_pooling=False,
+              f0_predictor='pm',
               enhancer_adaptive_key = 0,
               cr_threshold = 0.05
               ):
@@ -202,7 +209,7 @@ class Svc(object):
             if len(self.spk2id.__dict__) >= speaker:
                 speaker_id = speaker
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling,cr_threshold=cr_threshold)
+        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
@@ -245,7 +252,7 @@ class Svc(object):
                         clip_seconds=0,
                         lg_num=0,
                         lgr_num =0.75,
-                        F0_mean_pooling = False,
+                        f0_predictor='pm',
                         enhancer_adaptive_key = 0,
                         cr_threshold = 0.05
                         ):
@@ -286,7 +293,7 @@ class Svc(object):
                                 cluster_infer_ratio=cluster_infer_ratio,
                                 auto_predict_f0=auto_predict_f0,
                                 noice_scale=noice_scale,
-                                F0_mean_pooling = F0_mean_pooling,
+                                f0_predictor = f0_predictor,
                                 enhancer_adaptive_key = enhancer_adaptive_key,
                                 cr_threshold = cr_threshold
                                 )
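A note for readers extending the predictor set: get_unit_f0 now selects the predictor with an if/elif chain keyed on the f0_predictor string. The same dispatch can be written as a lookup table; the sketch below is illustrative only (the helper make_f0_predictor and the registry dict are not part of this patch), assuming the four classes this patch introduces:

    from modules.F0Predictor.CrepeF0Predictor import CrepeF0Predictor
    from modules.F0Predictor.DioF0Predictor import DioF0Predictor
    from modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
    from modules.F0Predictor.PMF0Predictor import PMF0Predictor

    # Hypothetical registry mirroring the if/elif chain above.
    F0_PREDICTORS = {
        "pm": PMF0Predictor,
        "dio": DioF0Predictor,
        "harvest": HarvestF0Predictor,
        "crepe": CrepeF0Predictor,
    }

    def make_f0_predictor(name, hop_length, sampling_rate, **kwargs):
        if name not in F0_PREDICTORS:
            raise ValueError(f"Unknown f0 predictor: {name}")
        # Only CrepeF0Predictor accepts device= and threshold=, so callers
        # should pass those extra keyword arguments for crepe alone.
        return F0_PREDICTORS[name](hop_length=hop_length, sampling_rate=sampling_rate, **kwargs)

Adding a new predictor then becomes a one-line registry entry instead of another elif branch.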
diff --git a/inference_main.py b/inference_main.py
index 833f867..df88ca5 100644
--- a/inference_main.py
+++ b/inference_main.py
@@ -35,7 +35,7 @@ def main():
     parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='Path to the cluster model; any value is fine if no cluster model was trained')
     parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='Proportion of the cluster scheme, range 0-1; leave at the default 0 if no cluster model was trained')
     parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='Crossfade length between two audio slices, in seconds. Adjust this if forced slicing makes the vocals discontinuous; if they are continuous, keep the default 0')
-    parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='Whether to apply a mean filter (pooling) to F0, which improves some muted segments. Note: enabling this slows down inference. Off by default')
+    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor to use: crepe, pm, dio or harvest; default pm (note: crepe is the former F0 mean-pooling option)')
     parser.add_argument('-eh', '--enhance', type=bool, default=False, help='Whether to use the NSF_HIFIGAN enhancer. It can improve audio quality somewhat for models with small training sets, but has a negative effect on well-trained models. Off by default')
 
     # Settings that normally need no change
@@ -46,7 +46,7 @@ def main():
     parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='Audio output format')
     parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='After automatic slicing, the head and tail of each slice are discarded. This sets the proportion of the crossfade length to keep, range 0-1 (open on the left, closed on the right)')
     parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='Adapt the enhancer to a higher pitch range (in semitones) | default 0')
-    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filtering threshold, only effective when f0_mean_pooling is enabled. Range 0-1. Lowering it reduces the chance of pitch drift but increases muting')
+    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filtering threshold, only effective with crepe. Range 0-1. Lowering it reduces the chance of pitch drift but increases muting')
 
     args = parser.parse_args()
 
@@ -63,7 +63,7 @@ def main():
     clip = args.clip
     lg = args.linear_gradient
     lgr = args.linear_gradient_retain
-    F0_mean_pooling = args.f0_mean_pooling
+    f0p = args.f0_predictor
     enhance = args.enhance
     enhancer_adaptive_key = args.enhancer_adaptive_key
     cr_threshold = args.f0_filter_threshold
@@ -115,7 +115,7 @@ def main():
             cluster_infer_ratio=cluster_infer_ratio,
             auto_predict_f0=auto_predict_f0,
             noice_scale=noice_scale,
-            F0_mean_pooling = F0_mean_pooling,
+            f0_predictor = f0p,
             enhancer_adaptive_key = enhancer_adaptive_key,
             cr_threshold = cr_threshold
             )
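With the new -f0p flag, a command-line run picks the predictor by name; programmatically, the same choice reaches Svc.slice_inference as in this minimal sketch. The model and input paths are placeholders, the keyword names follow the call in main() above, and the positional head (path, speaker, transpose, slice_db) mirrors how webUI.py calls slice_inference later in this patch:

    from inference.infer_tool import Svc

    model = Svc("logs/44k/G_37600.pth", "configs/config.json")  # placeholder paths
    audio = model.slice_inference(
        "raw/example.wav", "speaker0", 0, -40,  # path, speaker, transpose, slice_db
        cluster_infer_ratio=0,
        auto_predict_f0=False,
        noice_scale=0.4,
        f0_predictor="harvest",                 # "pm", "dio", "harvest" or "crepe"
        cr_threshold=0.05,                      # only consulted by crepe
    )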
diff --git a/modules/F0Predictor/CrepeF0Predictor.py b/modules/F0Predictor/CrepeF0Predictor.py
new file mode 100644
index 0000000..e005288
--- /dev/null
+++ b/modules/F0Predictor/CrepeF0Predictor.py
@@ -0,0 +1,31 @@
+from modules.F0Predictor.F0Predictor import F0Predictor
+from modules.F0Predictor.crepe import CrepePitchExtractor
+import torch
+
+class CrepeF0Predictor(F0Predictor):
+    def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"):
+        self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model)
+        self.hop_length = hop_length
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.device = device
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+
+    def compute_f0(self,wav,p_len=None):
+        x = torch.FloatTensor(wav).to(self.device)
+        if p_len is None:
+            p_len = x.shape[0]//self.hop_length
+        else:
+            assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
+        f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len)
+        return f0
+
+    def compute_f0_uv(self,wav,p_len=None):
+        x = torch.FloatTensor(wav).to(self.device)
+        if p_len is None:
+            p_len = x.shape[0]//self.hop_length
+        else:
+            assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
+        f0,uv = self.F0Creper(x[None,:].float(),self.sampling_rate,pad_to=p_len)
+        return f0,uv
\ No newline at end of file
diff --git a/modules/F0Predictor/DioF0Predictor.py b/modules/F0Predictor/DioF0Predictor.py
new file mode 100644
index 0000000..091f8a7
--- /dev/null
+++ b/modules/F0Predictor/DioF0Predictor.py
@@ -0,0 +1,59 @@
+from modules.F0Predictor.F0Predictor import F0Predictor
+import pyworld
+import numpy as np
+import scipy.ndimage
+
+class DioF0Predictor(F0Predictor):
+    def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
+        self.hop_length = hop_length
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.sampling_rate = sampling_rate
+
+    def resize_f0(self,x, target_len):
+        source = np.array(x)
+        source[source<0.001] = np.nan
+        target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
+        res = np.nan_to_num(target)
+        return res
+
+    def resize_f0_uv(self,x, target_len):
+        source = np.array(x)
+        vuv_vector = np.zeros_like(x)
+        vuv_vector[x > 0.0] = 1.0
+        vuv_vector[x < 0.001] = 0.0
+        source[source<0.001] = np.nan
+        target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
+        res = np.nan_to_num(target)
+        vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,target_len/len(vuv_vector),order = 0))
+        return res,vuv_vector
+
+    def compute_f0(self,wav,p_len=None):
+        if p_len is None:
+            p_len = wav.shape[0]//self.hop_length
+        f0, t = pyworld.dio(
+            wav.astype(np.double),
+            fs=self.sampling_rate,
+            f0_floor=self.f0_min,
+            f0_ceil=self.f0_max,
+            frame_period=1000 * self.hop_length / self.sampling_rate,
+        )
+        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
+        for index, pitch in enumerate(f0):
+            f0[index] = round(pitch, 1)
+        return self.resize_f0(f0, p_len)
+
+    def compute_f0_uv(self,wav,p_len=None):
+        if p_len is None:
+            p_len = wav.shape[0]//self.hop_length
+        f0, t = pyworld.dio(
+            wav.astype(np.double),
+            fs=self.sampling_rate,
+            f0_floor=self.f0_min,
+            f0_ceil=self.f0_max,
+            frame_period=1000 * self.hop_length / self.sampling_rate,
+        )
+        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
+        for index, pitch in enumerate(f0):
+            f0[index] = round(pitch, 1)
+        return self.resize_f0_uv(f0, p_len)
\ No newline at end of file
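The WORLD-based predictors (dio above, harvest below) resample their frame-rate F0 track to the expected length with the resize_f0 logic shown here. Unvoiced frames are set to NaN before np.interp so a hard 0 Hz is never blended into neighbouring voiced frames; the NaN propagates through the interpolation and is mapped back to 0 afterwards. A standalone toy run, duplicating the expression above with numpy only:

    import numpy as np

    x = np.array([100.0, 0.0, 110.0, 120.0])  # 0.0 marks an unvoiced frame
    target_len = 8
    source = x.copy()
    source[source < 0.001] = np.nan           # unvoiced -> NaN, not 0 Hz
    xs = np.arange(0, len(source) * target_len, len(source)) / target_len
    target = np.interp(xs, np.arange(0, len(source)), source)
    res = np.nan_to_num(target)               # NaN -> 0: frames near the gap stay unvoiced
    print(res)  # [100.   0.   0.   0. 110. 115. 120. 120.]

Note how 110 and 120 are blended into 115 between two voiced frames, while frames adjacent to the unvoiced gap come back as 0 rather than as an average dragged toward 0 Hz.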
diff --git a/modules/F0Predictor/F0Predictor.py b/modules/F0Predictor/F0Predictor.py
new file mode 100644
index 0000000..69d8a9b
--- /dev/null
+++ b/modules/F0Predictor/F0Predictor.py
@@ -0,0 +1,16 @@
+class F0Predictor(object):
+    def compute_f0(self,wav,p_len):
+        '''
+        input: wav:[signal_length]
+               p_len:int
+        output: f0:[signal_length//hop_length]
+        '''
+        pass
+
+    def compute_f0_uv(self,wav,p_len):
+        '''
+        input: wav:[signal_length]
+               p_len:int
+        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
+        '''
+        pass
\ No newline at end of file
diff --git a/modules/F0Predictor/HarvestF0Predictor.py b/modules/F0Predictor/HarvestF0Predictor.py
new file mode 100644
index 0000000..fdb6016
--- /dev/null
+++ b/modules/F0Predictor/HarvestF0Predictor.py
@@ -0,0 +1,55 @@
+from modules.F0Predictor.F0Predictor import F0Predictor
+import pyworld
+import numpy as np
+import scipy.ndimage
+
+class HarvestF0Predictor(F0Predictor):
+    def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
+        self.hop_length = hop_length
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.sampling_rate = sampling_rate
+
+    def resize_f0(self,x, target_len):
+        source = np.array(x)
+        source[source<0.001] = np.nan
+        target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
+        res = np.nan_to_num(target)
+        return res
+
+    def resize_f0_uv(self,x, target_len):
+        source = np.array(x)
+        vuv_vector = np.zeros_like(x)
+        vuv_vector[x > 0.0] = 1.0
+        vuv_vector[x < 0.001] = 0.0
+        source[source<0.001] = np.nan
+        target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
+        res = np.nan_to_num(target)
+        vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,target_len/len(vuv_vector),order = 0))
+        return res,vuv_vector
+
+    def compute_f0(self,wav,p_len=None):
+        if p_len is None:
+            p_len = wav.shape[0]//self.hop_length
+        f0, t = pyworld.harvest(
+            wav.astype(np.double),
+            fs=self.sampling_rate,
+            f0_ceil=self.f0_max,
+            f0_floor=self.f0_min,
+            frame_period=1000 * self.hop_length / self.sampling_rate,
+        )
+        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
+        return self.resize_f0(f0, p_len)
+
+    def compute_f0_uv(self,wav,p_len=None):
+        if p_len is None:
+            p_len = wav.shape[0]//self.hop_length
+        f0, t = pyworld.harvest(
+            wav.astype(np.double),
+            fs=self.sampling_rate,
+            f0_floor=self.f0_min,
+            f0_ceil=self.f0_max,
+            frame_period=1000 * self.hop_length / self.sampling_rate,
+        )
+        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
+        return self.resize_f0_uv(f0, p_len)
\ No newline at end of file
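F0Predictor is a plain duck-typed base class: subclasses return an F0 array of p_len frames, and compute_f0_uv additionally returns a voiced/unvoiced mask of the same length. A deliberately trivial, hypothetical subclass (not in the patch) that shows the contract:

    import numpy as np
    from modules.F0Predictor.F0Predictor import F0Predictor

    class ConstantF0Predictor(F0Predictor):
        """Toy example: constant 220 Hz, every frame voiced."""
        def __init__(self, hop_length=512, sampling_rate=44100, pitch=220.0):
            self.hop_length = hop_length
            self.sampling_rate = sampling_rate
            self.pitch = pitch

        def compute_f0(self, wav, p_len=None):
            if p_len is None:
                p_len = wav.shape[0] // self.hop_length
            return np.full(p_len, self.pitch)

        def compute_f0_uv(self, wav, p_len=None):
            f0 = self.compute_f0(wav, p_len)
            return f0, np.ones_like(f0)  # uv mask: 1.0 means voiced

    wav = np.zeros(44100, dtype=np.float32)               # 1 s of silence
    f0, uv = ConstantF0Predictor().compute_f0_uv(wav)     # 86 frames of 220 Hz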
diff --git a/modules/F0Predictor/PMF0Predictor.py b/modules/F0Predictor/PMF0Predictor.py
new file mode 100644
index 0000000..ccf4128
--- /dev/null
+++ b/modules/F0Predictor/PMF0Predictor.py
@@ -0,0 +1,83 @@
+from modules.F0Predictor.F0Predictor import F0Predictor
+import parselmouth
+import numpy as np
+
+class PMF0Predictor(F0Predictor):
+    def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
+        self.hop_length = hop_length
+        self.f0_min = f0_min
+        self.f0_max = f0_max
+        self.sampling_rate = sampling_rate
+
+    def interpolate_f0(self,f0):
+        '''
+        Interpolate F0 across unvoiced frames
+        '''
+        data = np.reshape(f0, (f0.size, 1))
+
+        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
+        vuv_vector[data > 0.0] = 1.0
+        vuv_vector[data <= 0.0] = 0.0
+
+        ip_data = data
+
+        frame_number = data.size
+        last_value = 0.0
+        for i in range(frame_number):
+            if data[i] <= 0.0:
+                j = i + 1
+                for j in range(i + 1, frame_number):
+                    if data[j] > 0.0:
+                        break
+                if j < frame_number - 1:
+                    if last_value > 0.0:
+                        step = (data[j] - data[i - 1]) / float(j - i)
+                        for k in range(i, j):
+                            ip_data[k] = data[i - 1] + step * (k - i + 1)
+                    else:
+                        for k in range(i, j):
+                            ip_data[k] = data[j]
+                else:
+                    for k in range(i, frame_number):
+                        ip_data[k] = last_value
+            else:
+                ip_data[i] = data[i]  # this may be an unnecessary copy
+                last_value = data[i]
+
+        return ip_data[:,0], vuv_vector[:,0]
+
+    def compute_f0(self,wav,p_len=None):
+        x = wav
+        if p_len is None:
+            p_len = x.shape[0]//self.hop_length
+        else:
+            assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
+        time_step = self.hop_length / self.sampling_rate * 1000
+        f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
+            time_step=time_step / 1000, voicing_threshold=0.6,
+            pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
+
+        pad_size=(p_len - len(f0) + 1) // 2
+        if(pad_size>0 or p_len - len(f0) - pad_size>0):
+            f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+        f0,uv = self.interpolate_f0(f0)
+        return f0
+
+    def compute_f0_uv(self,wav,p_len=None):
+        x = wav
+        if p_len is None:
+            p_len = x.shape[0]//self.hop_length
+        else:
+            assert abs(p_len-x.shape[0]//self.hop_length) < 4, "pad length error"
+        time_step = self.hop_length / self.sampling_rate * 1000
+        f0 = parselmouth.Sound(x, self.sampling_rate).to_pitch_ac(
+            time_step=time_step / 1000, voicing_threshold=0.6,
+            pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
+
+        pad_size=(p_len - len(f0) + 1) // 2
+        if(pad_size>0 or p_len - len(f0) - pad_size>0):
+            f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+        f0,uv = self.interpolate_f0(f0)
+        return f0,uv
diff --git a/modules/F0Predictor/__init__.py b/modules/F0Predictor/__init__.py
new file mode 100644
index 0000000..e69de29
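interpolate_f0 bridges unvoiced stretches so downstream code never sees 0 Hz, and returns the original voicing as a separate mask. Note the bridge is anchored on data[i-1] with step (data[j]-data[i-1])/(j-i), so the filled values lean toward the next voiced frame rather than interpolating symmetrically. A quick check, with the expected output traced by hand against the loop above:

    import numpy as np
    from modules.F0Predictor.PMF0Predictor import PMF0Predictor

    pm = PMF0Predictor(hop_length=512, sampling_rate=44100)
    f0 = np.array([0.0, 100.0, 0.0, 0.0, 130.0, 0.0], dtype=np.float32)
    filled, uv = pm.interpolate_f0(f0)
    print(filled)  # [100. 100. 115. 130. 130. 130.] - gaps bridged, edges held
    print(uv)      # [0. 1. 0. 0. 1. 0.]             - original voicing kept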
diff --git a/modules/crepe.py b/modules/F0Predictor/crepe.py
similarity index 96%
rename from modules/crepe.py
rename to modules/F0Predictor/crepe.py
index 584a219..c6fb45c 100644
--- a/modules/crepe.py
+++ b/modules/F0Predictor/crepe.py
@@ -263,9 +263,17 @@ class CrepePitchExtractor(BasePitchExtractor):
         device = None,
         model: Literal["full", "tiny"] = "full",
         use_fast_filters: bool = True,
+        decoder="viterbi"
     ):
         super().__init__(hop_length, f0_min, f0_max, keep_zeros)
-
+        if decoder == "viterbi":
+            self.decoder = torchcrepe.decode.viterbi
+        elif decoder == "argmax":
+            self.decoder = torchcrepe.decode.argmax
+        elif decoder == "weighted_argmax":
+            self.decoder = torchcrepe.decode.weighted_argmax
+        else:
+            raise ValueError("Unknown decoder")
         self.threshold = threshold
         self.model = model
         self.use_fast_filters = use_fast_filters
@@ -306,6 +314,7 @@ class CrepePitchExtractor(BasePitchExtractor):
             batch_size=1024,
             device=x.device,
             return_periodicity=True,
+            decoder=self.decoder
         )
 
         # Filter, remove silence, set the uv threshold; see the original repository's README
diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py
index 763fb0d..b374a26 100644
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@@ -34,8 +34,10 @@ def process_one(filename, hmodel):
 
     f0_path = filename + ".f0.npy"
     if not os.path.exists(f0_path):
-        f0 = utils.compute_f0_dio(
-            wav, sampling_rate=sampling_rate, hop_length=hop_length
+        from modules.F0Predictor.DioF0Predictor import DioF0Predictor
+        f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length)
+        f0 = f0_predictor.compute_f0(
+            wav
         )
         np.save(f0_path, f0)
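CrepePitchExtractor now takes a decoder argument resolved to one of torchcrepe's decode functions: "viterbi" (the default) smooths the pitch track over time, while "argmax" and "weighted_argmax" decode each frame independently and are cheaper. A sketch of selecting it directly; the constructor keywords follow CrepeF0Predictor above, and the call convention is an assumption read off this diff, not documented API:

    import torch
    from modules.F0Predictor.crepe import CrepePitchExtractor

    extractor = CrepePitchExtractor(
        hop_length=512, f0_min=50, f0_max=1100,
        threshold=0.05, device="cpu",
        decoder="weighted_argmax",   # or "viterbi" / "argmax"
    )
    wav = torch.randn(44100)         # 1 s of placeholder audio at 44.1 kHz
    f0, uv = extractor(wav[None, :].float(), 44100, pad_to=44100 // 512)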
diff --git a/utils.py b/utils.py
index 71efc01..5f866f2 100644
--- a/utils.py
+++ b/utils.py
@@ -29,41 +29,6 @@ f0_min = 50.0
 f0_mel_min = 1127 * np.log(1 + f0_min / 700)
 f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
-
-# def normalize_f0(f0, random_scale=True):
-#     f0_norm = f0.clone()  # create a copy of the input Tensor
-#     batch_size, _, frame_length = f0_norm.shape
-#     for i in range(batch_size):
-#         means = torch.mean(f0_norm[i, 0, :])
-#         if random_scale:
-#             factor = random.uniform(0.8, 1.2)
-#         else:
-#             factor = 1
-#         f0_norm[i, 0, :] = (f0_norm[i, 0, :] - means) * factor
-#     return f0_norm
-# def normalize_f0(f0, random_scale=True):
-#     means = torch.mean(f0[:, 0, :], dim=1, keepdim=True)
-#     if random_scale:
-#         factor = torch.Tensor(f0.shape[0],1).uniform_(0.8, 1.2).to(f0.device)
-#     else:
-#         factor = torch.ones(f0.shape[0], 1, 1).to(f0.device)
-#     f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
-#     return f0_norm
-
-def deprecated(func):
-    """This is a decorator which can be used to mark functions
-    as deprecated. It will result in a warning being emitted
-    when the function is used."""
-    @functools.wraps(func)
-    def new_func(*args, **kwargs):
-        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
-        warnings.warn("Call to deprecated function {}.".format(func.__name__),
-                      category=DeprecationWarning,
-                      stacklevel=2)
-        warnings.simplefilter('default', DeprecationWarning)  # reset filter
-        return func(*args, **kwargs)
-    return new_func
-
 def normalize_f0(f0, x_mask, uv, random_scale=True):
     # calculate means based on x_mask
     uv_sum = torch.sum(uv, dim=1, keepdim=True)
@@ -80,20 +45,6 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
         exit(0)
     return f0_norm * x_mask
 
-def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None,cr_threshold=0.05):
-    from modules.crepe import CrepePitchExtractor
-    x = wav_numpy
-    if p_len is None:
-        p_len = x.shape[0]//hop_length
-    else:
-        assert abs(p_len-x.shape[0]//hop_length) < 4, "pad length error"
-
-    f0_min = 50
-    f0_max = 1100
-    F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=cr_threshold)
-    f0,uv = F0Creper(x[None,:].float(),sampling_rate,pad_to=p_len)
-    return f0,uv
-
 def plot_data_to_numpy(x, y):
     global MATPLOTLIB_FLAG
     if not MATPLOTLIB_FLAG:
@@ -117,87 +68,6 @@ def plot_data_to_numpy(x, y):
 
     return data
 
-
-def interpolate_f0(f0):
-    '''
-    Interpolate F0 across unvoiced frames
-    '''
-
-    data = np.reshape(f0, (f0.size, 1))
-
-    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-    vuv_vector[data > 0.0] = 1.0
-    vuv_vector[data <= 0.0] = 0.0
-
-    ip_data = data
-
-    frame_number = data.size
-    last_value = 0.0
-    for i in range(frame_number):
-        if data[i] <= 0.0:
-            j = i + 1
-            for j in range(i + 1, frame_number):
-                if data[j] > 0.0:
-                    break
-            if j < frame_number - 1:
-                if last_value > 0.0:
-                    step = (data[j] - data[i - 1]) / float(j - i)
-                    for k in range(i, j):
-                        ip_data[k] = data[i - 1] + step * (k - i + 1)
-                else:
-                    for k in range(i, j):
-                        ip_data[k] = data[j]
-            else:
-                for k in range(i, frame_number):
-                    ip_data[k] = last_value
-        else:
-            ip_data[i] = data[i]  # this may be an unnecessary copy
-            last_value = data[i]
-
-    return ip_data[:,0], vuv_vector[:,0]
-
-
-def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-    import parselmouth
-    x = wav_numpy
-    if p_len is None:
-        p_len = x.shape[0]//hop_length
-    else:
-        assert abs(p_len-x.shape[0]//hop_length) < 4, "pad length error"
-    time_step = hop_length / sampling_rate * 1000
-    f0_min = 50
-    f0_max = 1100
-    f0 = parselmouth.Sound(x, sampling_rate).to_pitch_ac(
-        time_step=time_step / 1000, voicing_threshold=0.6,
-        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
-
-    pad_size=(p_len - len(f0) + 1) // 2
-    if(pad_size>0 or p_len - len(f0) - pad_size>0):
-        f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
-    return f0
-
-def resize_f0(x, target_len):
-    source = np.array(x)
-    source[source<0.001] = np.nan
-    target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
-    res = np.nan_to_num(target)
-    return res
-
-def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-    import pyworld
-    if p_len is None:
-        p_len = wav_numpy.shape[0]//hop_length
-    f0, t = pyworld.dio(
-        wav_numpy.astype(np.double),
-        fs=sampling_rate,
-        f0_ceil=800,
-        frame_period=1000 * hop_length / sampling_rate,
-    )
-    f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
-    for index, pitch in enumerate(f0):
-        f0[index] = round(pitch, 1)
-    return resize_f0(f0, p_len)
-
 def f0_to_coarse(f0):
     is_torch = isinstance(f0, torch.Tensor)
     f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
@@ -246,7 +116,6 @@ def get_content(cmodel, y):
 
     return c
 
-
 def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
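The helpers deleted from utils.py survive as methods on the new predictor classes; a before/after sketch for callers migrating off utils. Note that the default pitch range also changes: the old compute_f0_dio used f0_ceil=800 with no explicit floor, while DioF0Predictor defaults to f0_min=50 and f0_max=1100.

    # Before (removed in this patch):
    #     f0 = utils.compute_f0_dio(wav, sampling_rate=44100, hop_length=512)
    # After:
    from modules.F0Predictor.DioF0Predictor import DioF0Predictor

    f0_predictor = DioF0Predictor(sampling_rate=44100, hop_length=512)
    f0 = f0_predictor.compute_f0(wav)         # resized F0 track, as before
    f0, uv = f0_predictor.compute_f0_uv(wav)  # or with a voiced/unvoiced mask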
diff --git a/webUI.py b/webUI.py
index 499ff0b..e22737f 100644
--- a/webUI.py
+++ b/webUI.py
@@ -106,7 +106,7 @@ def modelUnload():
         return sid.update(choices = [],value=""),"Model unloaded!"
 
-def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
+def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold):
     global model
     try:
         if input_audio is None:
@@ -120,7 +120,7 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
             audio = librosa.to_mono(audio.transpose(1, 0))
         temp_path = "temp.wav"
         soundfile.write(temp_path, audio, sampling_rate, format="wav")
-        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
+        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold)
         model.clear_empty()
         os.remove(temp_path)
         # Build the output file path and save it into the results folder
@@ -166,7 +166,7 @@ def tts_func(_text,_rate,_voice):
 
 def text_clear(text):
     return re.sub(r"[\n\,\(\) ]", "", text)
 
-def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
+def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,f0_predictor,enhancer_adaptive_key,cr_threshold):
     # Use edge-tts to convert the text to audio
     text2tts=text_clear(text2tts)
     output_file=tts_func(text2tts,tts_rate,tts_voice)
@@ -184,7 +184,7 @@ def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, nois
     sample_rate, data=gr_pu.audio_from_file(save_path2)
     vc_input=(sample_rate, data)
-    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
+    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold)
     os.remove(output_file)
     os.remove(save_path2)
     return a,b
@@ -231,7 +231,7 @@ with gr.Blocks(
                     Inference settings
                     """)
                 auto_f0 = gr.Checkbox(label="Automatic F0 prediction; works better together with the cluster model's F0 prediction, and disables transpose (speech conversion only - singing will go wildly off pitch with this checked)", value=False)
-                F0_mean_pooling = gr.Checkbox(label="Apply a mean filter (pooling) to F0, which improves some muted segments. Note: enabling this slows down inference. Off by default", value=False)
+                f0_predictor = gr.Dropdown(label="F0 predictor: crepe, pm, dio or harvest; default pm (note: crepe is the former F0 mean-pooling option)", choices=["pm","dio","harvest","crepe"], value="pm")
                 vc_transform = gr.Number(label="Transpose (integer, in semitones; positive or negative; +12 is one octave up)", value=0)
                 cluster_ratio = gr.Number(label="Cluster model mixing ratio, 0-1; 0 disables clustering. The cluster model improves timbre similarity but weakens articulation (around 0.5 is recommended if used)", value=0)
                 slice_db = gr.Number(label="Slicing threshold", value=-40)
@@ -242,7 +242,7 @@
                 lg_num = gr.Number(label="Crossfade length between sliced segments, in seconds. Adjust this if automatic slicing makes the vocals discontinuous; if they are continuous, keep the default 0. Note: this setting affects inference speed", value=0)
                 lgr_num = gr.Number(label="After automatic slicing, the head and tail of each slice are discarded. This sets the proportion of the crossfade length to keep, range 0-1 (open on the left, closed on the right)", value=0.75)
                 enhancer_adaptive_key = gr.Number(label="Adapt the enhancer to a higher pitch range (in semitones) | default 0", value=0)
-                cr_threshold = gr.Number(label="F0 filtering threshold, only effective when f0_mean_pooling is enabled. Range 0-1. Lowering it reduces the chance of pitch drift but increases muting", value=0.05)
+                cr_threshold = gr.Number(label="F0 filtering threshold, only effective when crepe is used. Range 0-1. Lowering it reduces the chance of pitch drift but increases muting", value=0.05)
         with gr.Tabs():
             with gr.TabItem("Audio-to-audio"):
                 vc_input3 = gr.Audio(label="Select audio")
@@ -300,8 +300,8 @@ with gr.Blocks(
                 WebUI settings
                 """)
             debug_button = gr.Checkbox(label="Debug mode: enable it when reporting bugs to the community, so the console shows the full error message", value=debug)
-        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
-        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
+        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
+        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,f0_predictor,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
         debug_button.change(debug_change,[],[])
         model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output])
         model_unload_button.click(modelUnload,[],[sid,sid_output])