so-vits-svc/inference_main.py

122 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import io
import logging
import time
from pathlib import Path
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
logging.getLogger('numba').setLevel(logging.WARNING)
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
def main():
import argparse
parser = argparse.ArgumentParser(description='sovits4 inference')
# 一定要设置的部分
parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径')
parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径')
parser.add_argument('-cl', '--clip', type=float, default=0, help='音频强制切片默认0为自动切片单位为秒/s')
parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='wav文件名列表放在raw文件夹下')
parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='音高调整,支持正负(半音)')
parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='合成目标说话人名称')
# 可选项部分
parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='语音转换自动预测音高,转换歌声时不要打开这个会严重跑调')
parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='聚类模型路径,如果没有训练聚类则随便填')
parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='聚类方案占比范围0-1若没有训练聚类模型则默认0即可')
parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='两段音频切片的交叉淡入长度如果强制切片后出现人声不连贯可调整该数值如果连贯建议采用默认值0单位为秒')
parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意crepe为原F0使用均值滤波器)')
parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')
parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='是否使用浅层扩散使用后可解决一部分电音问题默认关闭该选项打开时NSF_HIFIGAN增强器将会被禁止')
# 浅扩散设置
parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='扩散模型路径')
parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='扩散模型配置文件路径')
parser.add_argument('-ks', '--k_step', type=int, default=100, help='扩散步数越大越接近扩散模型的结果默认100')
parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='纯扩散模式该模式不会加载sovits模型以扩散模型推理')
# 不用动的部分
parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='默认-40嘈杂的音频可以-30干声保留呼吸可以-50')
parser.add_argument('-d', '--device', type=str, default=None, help='推理设备None则为自动选择cpu和gpu')
parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='噪音级别,会影响咬字和音质,较为玄学')
parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='推理音频pad秒数由于未知原因开头结尾会有异响pad一小段静音段后就不会出现')
parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音频输出格式')
parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='自动音频切片后需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例范围0-1,左开右闭')
parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='使增强器适应更高的音域(单位为半音数)|默认为0')
parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0过滤阈值只有使用crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音')
args = parser.parse_args()
clean_names = args.clean_names
trans = args.trans
spk_list = args.spk_list
slice_db = args.slice_db
wav_format = args.wav_format
auto_predict_f0 = args.auto_predict_f0
cluster_infer_ratio = args.cluster_infer_ratio
noice_scale = args.noice_scale
pad_seconds = args.pad_seconds
clip = args.clip
lg = args.linear_gradient
lgr = args.linear_gradient_retain
f0p = args.f0_predictor
enhance = args.enhance
enhancer_adaptive_key = args.enhancer_adaptive_key
cr_threshold = args.f0_filter_threshold
diffusion_model_path = args.diffusion_model_path
diffusion_config_path = args.diffusion_config_path
k_step = args.k_step
only_diffusion = args.only_diffusion
shallow_diffusion = args.shallow_diffusion
svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
infer_tool.mkdir(["raw", "results"])
infer_tool.fill_a_to_b(trans, clean_names)
for clean_name, tran in zip(clean_names, trans):
raw_audio_path = f"raw/{clean_name}"
if "." not in raw_audio_path:
raw_audio_path += ".wav"
infer_tool.format_wav(raw_audio_path)
for spk in spk_list:
kwarg = {
"raw_audio_path" : raw_audio_path,
"spk" : spk,
"tran" : tran,
"slice_db" : slice_db,
"cluster_infer_ratio" : cluster_infer_ratio,
"auto_predict_f0" : auto_predict_f0,
"noice_scale" : noice_scale,
"pad_seconds" : pad_seconds,
"clip_seconds" : clip,
"lg_num": lg,
"lgr_num" : lgr,
"f0_predictor" : f0p,
"enhancer_adaptive_key" : enhancer_adaptive_key,
"cr_threshold" : cr_threshold,
"k_step":k_step
}
audio = svc_model.slice_inference(**kwarg)
key = "auto" if auto_predict_f0 else f"{tran}key"
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
isdiffusion = "sovits"
if shallow_diffusion : isdiffusion = "sovdiff"
if only_diffusion : isdiffusion = "diff"
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
svc_model.clear_empty()
if __name__ == '__main__':
main()