Compare commits
31 Commits
fec31b364a
...
68802ecd83
Author | SHA1 | Date |
---|---|---|
YuriHead | 68802ecd83 | |
ylzz1997 | 4e402d8a64 | |
YuriHead | cb2a0cae61 | |
innnky | 347f88e220 | |
innnky | 1796ccf9a2 | |
ylzz1997 | b6c779ad2a | |
ylzz1997 | d34715321c | |
YuriHead | 26ad74e0fc | |
ylzz1997 | eb257f04f1 | |
ylzz1997 | 107d0a30a9 | |
ylzz1997 | 966f20aa3b | |
ylzz1997 | 823b43820a | |
ylzz1997 | 66bd04549a | |
ylzz1997 | 8a82ff00c8 | |
YuriHead | f6709b6218 | |
ylzz1997 | 8675d77e63 | |
asdfw13 | 0705b9e412 | |
ylzz1997 | 1caf819044 | |
ylzz1997 | 8388efe279 | |
ylzz1997 | 614c61177d | |
ylzz1997 | 8b3261d33a | |
ylzz1997 | 7351c530bc | |
ylzz1997 | 39628c1a97 | |
ylzz1997 | 08617333ce | |
ylzz1997 | fd8e717112 | |
ylzz1997 | afde9defed | |
YuriHead | 8117b2c85f | |
CN_ChiTu | b7d6905b80 | |
asdfw13 | c92aa6b2ac | |
mlbv | c63fd1f40c | |
YuriHead | 64591bd664 |
|
@ -519,6 +519,8 @@ Note: For Hubert Onnx models, please use the models provided by MoeSS. Currently
|
|||
|[2105.02446v3](https://arxiv.org/abs/2105.02446v3) | Shallow Diffusion (PostProcessing)| DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism | [CNChTu/Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC) |
|
||||
|[K-means](https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=01D65490BADCC216F350D06F84D721AD?doi=10.1.1.308.8619&rep=rep1&type=pdf) | Feature K-means Clustering (PreProcessing)| Some methods for classification and analysis of multivariate observations | This repo |
|
||||
| | Feature TopK Retrieval (PreProcessing)| Retrieval based Voice Conversion | [RVC-Project/Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) |
|
||||
| | whisper ppg| whisper ppg | [PlayVoice/whisper_ppg](https://github.com/PlayVoice/whisper_ppg) |
|
||||
| | bigvgan| bigvgan | [PlayVoice/so-vits-svc-5.0](https://github.com/PlayVoice/so-vits-svc-5.0/tree/bigvgan-mix-v2/vits_decoder/alias) |
|
||||
|
||||
|
||||
## ☀️ Previous contributors
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from diffusion.unit2mel import load_model_vocoder
|
||||
|
||||
|
||||
class DiffGtMel:
    """Run a diffusion model on audio, conditioned on that audio's own mel.

    The model, vocoder and their configuration are loaded on demand by
    :meth:`flush_model` (through ``load_model_vocoder``). Calling the instance
    extracts a mel from the input audio, passes it to the model as ``gt_spec``
    and vocodes the model output.
    """

    def __init__(self, project_path=None, device=None):
        """Remember the model path and pick a device.

        Args:
            project_path: Path handed to ``load_model_vocoder`` later on.
            device: Torch device string; defaults to ``'cuda'`` when
                available, otherwise ``'cpu'``.
        """
        self.project_path = project_path
        if device is not None:
            self.device = device
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = None
        self.vocoder = None
        self.args = None

    def flush_model(self, project_path, ddsp_config=None):
        """(Re)load model and vocoder when nothing is loaded or the path changed.

        Args:
            project_path: Path of the diffusion model to load.
            ddsp_config: Configuration to validate the loaded model against
                (see :meth:`check_args`). ``None`` skips the validation.

        Raises:
            ValueError: If ``ddsp_config`` and the loaded configuration disagree.
        """
        if (self.model is not None) and (project_path == self.project_path):
            return
        model, vocoder, args = load_model_vocoder(project_path, device=self.device)
        # Without a reference config there is nothing to compare against;
        # previously this case crashed on ``None.data``.
        if ddsp_config is None or self.check_args(ddsp_config, args):
            self.model = model
            self.vocoder = vocoder
            self.args = args
            # Remember what is loaded so the same path is not reloaded on
            # every subsequent call.
            self.project_path = project_path

    def check_args(self, args1, args2):
        """Return True when both configs agree on block size, sample rate and encoder.

        Raises:
            ValueError: On the first mismatching ``data`` field.
        """
        if args1.data.block_size != args2.data.block_size:
            raise ValueError("DDSP与DIFF模型的block_size不一致")
        if args1.data.sampling_rate != args2.data.sampling_rate:
            raise ValueError("DDSP与DIFF模型的sampling_rate不一致")
        if args1.data.encoder != args2.data.encoder:
            raise ValueError("DDSP与DIFF模型的encoder不一致")
        return True

    def __call__(self, audio, f0, hubert, volume, acc=1, spk_id=1, k_step=0, method='pndm',
                 spk_mix_dict=None, start_frame=0):
        """Run the diffusion model and vocode its output.

        Args:
            audio: Input audio; its mel (at ``args.data.sampling_rate``) is
                passed to the model as ``gt_spec``.
            f0: Pitch track, indexed as ``[:, frame, :]``.
            hubert: Unit features forwarded to the model.
            volume: Volume envelope forwarded to the model.
            acc: Forwarded as ``infer_speedup``.
            spk_id: Speaker id forwarded to the model.
            k_step: Forwarded as ``k_step``.
            method: Sampler name forwarded to the model.
            spk_mix_dict: Optional speaker-mix mapping forwarded to the model.
            start_frame: Number of leading frames to drop before vocoding; the
                output is left-padded with zeros to compensate.

        Returns:
            The vocoder output (left-padded when ``start_frame > 0``).
        """
        input_mel = self.vocoder.extract(audio, self.args.data.sampling_rate)
        out_mel = self.model(
            hubert,
            f0,
            volume,
            spk_id=spk_id,
            spk_mix_dict=spk_mix_dict,
            gt_spec=input_mel,
            infer=True,
            infer_speedup=acc,
            method=method,
            k_step=k_step,
            use_tqdm=False)
        if start_frame > 0:
            out_mel = out_mel[:, start_frame:, :]
            f0 = f0[:, start_frame:, :]
        output = self.vocoder.infer(out_mel, f0)
        if start_frame > 0:
            # Restore the original length: one frame is vocoder_hop_size samples.
            output = F.pad(output, (start_frame * self.vocoder.vocoder_hop_size, 0))
        return output

    def infer(self, audio, f0, hubert, volume, acc=1, spk_id=1, k_step=0, method='pndm', silence_front=0,
              use_silence=False, spk_mix_dict=None):
        """Run inference, optionally skipping a leading stretch of the input.

        Args:
            silence_front: Length of the leading stretch in seconds.
            use_silence: When True the leading stretch is cut from every input
                before the model runs; when False the full input is processed
                and only the vocoder skips it. Either way the result is
                left-padded back to full length.

        The remaining arguments are forwarded to :meth:`__call__`.
        """
        hop_size = self.vocoder.vocoder_hop_size
        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / hop_size)
        if use_silence:
            audio = audio[:, start_frame * hop_size:]
            f0 = f0[:, start_frame:, :]
            hubert = hubert[:, start_frame:, :]
            volume = volume[:, start_frame:, :]
            _start_frame = 0
        else:
            _start_frame = start_frame
        audio = self(audio, f0, hubert, volume, acc=acc, spk_id=spk_id, k_step=k_step,
                     method=method, spk_mix_dict=spk_mix_dict, start_frame=_start_frame)
        if use_silence and start_frame > 0:
            audio = F.pad(audio, (start_frame * hop_size, 0))
        return audio
|
|
@ -101,6 +101,7 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
|
|||
|
||||
# run
|
||||
num_batches = len(loader_train)
|
||||
start_epoch = initial_global_step // num_batches
|
||||
model.train()
|
||||
saver.log_info('======= start training =======')
|
||||
scaler = GradScaler()
|
||||
|
@ -113,7 +114,7 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
|
|||
else:
|
||||
raise ValueError(' [x] Unknown amp_dtype: ' + args.train.amp_dtype)
|
||||
saver.log_info("epoch|batch_idx/num_batches|output_dir|batch/s|lr|time|step")
|
||||
for epoch in range(args.train.epochs):
|
||||
for epoch in range(start_epoch, args.train.epochs):
|
||||
for batch_idx, data in enumerate(loader_train):
|
||||
saver.global_step_increment()
|
||||
optimizer.zero_grad()
|
||||
|
|
|
@ -13,7 +13,7 @@ from .wavenet import WaveNet
|
|||
class DotDict(dict):
    """A dict whose keys can also be read, set and deleted as attributes.

    Reading a missing attribute returns ``None`` (it goes through
    ``dict.get``) instead of raising ``AttributeError``. Plain ``dict`` values
    are wrapped in a new ``DotDict`` on access so nested keys can be chained.
    """

    def __getattr__(*args):
        # Called as (self, name); forwarded unchanged to dict.get.
        val = dict.get(*args)
        # Exact type check on purpose: only plain dicts are wrapped, values
        # that are already a dict subclass are returned as they are.
        return DotDict(val) if type(val) is dict else val  # noqa: E721

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
|
||||
|
|
|
@ -0,0 +1,516 @@
|
|||
import json
|
||||
import os
|
||||
import pickle
|
||||
import threading
|
||||
|
||||
# from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder
|
||||
# from ddsp.core import upsample
|
||||
import time
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import PySimpleGUI as sg
|
||||
import sounddevice as sd
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
from gui_i18 import I18nAuto
|
||||
from inference.infer_tool import Svc
|
||||
|
||||
|
||||
class Config:
    """Settings of the real-time GUI, persisted as ``config.pkl``."""

    def __init__(self) -> None:
        self.samplerate = 44100  # Hz
        self.block_time = 1.5  # s
        self.f_pitch_change: float = 0.0
        self.spk_id = 0  # default speaker
        self.spk_list = [0]
        # self.spk_mix_dict = None  # {1:0.5, 2:0.5} mixes speakers 1 and 2 at 0.5:0.5
        self.use_vocoder_based_enhancer = True
        self.use_feature_retrieval = False
        self.cluster_infer_ratio = 0
        self.checkpoint_path = ''
        self.kmeans_path = ''
        self.threhold = -35
        self.buffer_num = 2
        self.noice_scale = 0.4
        self.crossfade_time = 0.03
        # F0 predictor: ["parselmouth", "dio", "harvest", "crepe", "rmvpe", "fcpe"]
        self.select_pitch_extractor = 'fcpe'
        # self.use_spk_mix = False
        self.sounddevices = ['', '']
        self.diff_use = False
        self.auto_F0 = False
        self.diff_project = ''
        self.diff_acc = 10
        self.k_step = 100
        self.diff_method = 'pndm'
        self.diff_silence = False
        self.second_encoding = False

    def save(self, path):
        """Pickle all attributes to ``<path>/config.pkl``."""
        # os.path.join instead of a hard-coded '\\' so this also works
        # outside Windows.
        with open(os.path.join(path, 'config.pkl'), 'wb') as f:
            pickle.dump(vars(self), f)

    def load(self, path) -> bool:
        """Load attributes from ``<path>/config.pkl``.

        Returns:
            True when the file was read and applied, False otherwise. Failures
            are reported on stdout and never raised.
        """
        try:
            with open(os.path.join(path, 'config.pkl'), 'rb') as f:
                # NOTE: unpickling can execute arbitrary code; only load
                # config files from a trusted source.
                self.update(pickle.load(f))
            return True
        except FileNotFoundError:
            print('config.pkl does not exist')
            return False
        except Exception as err:  # best effort: a bad file must not crash the GUI
            print(f'failed to load config.pkl: {err}')
            return False

    def update(self, data_dict):
        """Set every key of ``data_dict`` as an attribute on this object."""
        for key, value in data_dict.items():
            setattr(self, key, value)
|
||||
|
||||
class GUI:
|
||||
def __init__(self) -> None:
    """Initialise GUI state, then build the window via ``launcher()``.

    ``launcher()`` is called last and enters the event loop, so the
    constructor only returns when that loop ends.
    """
    self.config = Config()
    self.flag_vc: bool = False  # True while the voice-conversion thread should keep running
    self.block_frame = 0
    self.crossfade_frame = 0
    self.sola_search_frame = 0
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.svc_model = None
    self.fade_in_window: np.ndarray = None  # crossfade fade-in window (start_vc assigns a torch tensor)
    self.fade_out_window: np.ndarray = None  # crossfade fade-out window (start_vc assigns a torch tensor)
    self.input_wav: np.ndarray = None  # buffer holding the normalized input audio
    self.output_wav: np.ndarray = None  # buffer for the normalized output audio
    self.sola_buffer: torch.Tensor = None  # crossfade tail kept from the previous output block
    self.f0_mode_list = ["pm", "dio", "harvest", "crepe" ,"rmvpe","fcpe"]  # F0 predictors
    self.diff_method_list = ["ddim", "pndm", "dpm-solver++", "dpm-solver", "unipc"]
    self.f_safe_prefix_pad_length: float = 0.0
    self.resample_kernel = {}
    self.launcher()  # start
|
||||
|
||||
def launcher(self):
    """Build the main window, bind the <Return> events and start the event loop.

    Labels are translated through the module-level ``i18n`` callable.
    """
    input_devices, output_devices, input_indices, output_indices = self.get_devices()

    def _default_name(names, indices, device_index):
        # ``names`` only contains devices of one direction, while the default
        # device is reported as an index into the full device list, so it has
        # to be mapped through ``indices``. Falls back to the first entry (or
        # '') when there is no usable default.
        if device_index in indices:
            return names[indices.index(device_index)]
        return names[0] if names else ''

    default_input = _default_name(input_devices, input_indices, sd.default.device[0])
    default_output = _default_name(output_devices, output_indices, sd.default.device[1])

    sg.theme('LightBlue4')  # set the theme
    sg.theme_background_color("#4BD2D8")
    sg.theme_element_background_color("#4BD2D8")
    sg.theme_text_element_background_color("#4BD2D8")
    # window layout
    layout = [
        [sg.Frame(layout=[
            [sg.Input(key='sg_model', default_text='logs\\44k\\G_30000.pth', enable_events=True),
             sg.FileBrowse(i18n('选择模型文件'), key='choose_model')]
        ], title=i18n('模型:.pth格式(自动识别同目录下config.json)')),
            sg.Frame(layout=[
                [sg.Text(i18n('选择配置文件所在目录')), sg.Input(key='config_file_dir', default_text='configs'),
                 sg.FolderBrowse(i18n('打开文件夹'), key='choose_config')],
                [sg.Button(i18n('读取配置文件'), key='load_config'),
                 sg.Button(i18n('保存配置文件'), key='save_config')]
            ], title=i18n('快速配置文件'))
        ],
        [sg.Frame(layout=[
            [sg.Text(i18n("输入设备")),
             sg.Combo(input_devices, key='sg_input_device', default_value=default_input,
                      enable_events=True)],
            [sg.Text(i18n("输出设备")),
             sg.Combo(output_devices, key='sg_output_device', default_value=default_output,
                      enable_events=True)]
        ], title=i18n('音频设备')),
            sg.Frame(layout=[
                [sg.Input(key='kmeans_model', default_text='logs\\44k\\kmeans_10000.pt'),
                 sg.FileBrowse('选择聚类或特征检索文件', key='choose_model')]
            ], title="选择聚类或特征检索文件"),
        ],
        [sg.Frame(layout=[
            [sg.Text(i18n("说话人")), sg.Combo(self.config.spk_list, key='spk_id', default_value=self.config.spk_id, size=25)],
            [sg.Text(i18n("响应阈值")),
             sg.Slider(range=(-65, 0), orientation='h', key='threhold', resolution=1, default_value=-45,
                       enable_events=True)],
            [sg.Text("特征检索/聚类比例"),
             sg.Slider(range=(0, 1), orientation='h', key='cluster_infer_ratio', resolution=0.01, default_value=0,
                       enable_events=True)],
            [sg.Text(i18n("变调")),
             sg.Slider(range=(-24, 24), orientation='h', key='pitch', resolution=1, default_value=0,
                       enable_events=True)],
            [sg.Text(i18n("采样率")), sg.Input(key='samplerate', default_text='44100', size=8)],
            [sg.Text("噪音级别,会影响咬字和音质"),
             sg.Slider(range=(0, 1), orientation='h', key='noice_scale', resolution=0.01, default_value=0.4,
                       enable_events=True)],
        ], title=i18n('普通设置')),
            sg.Frame(layout=[
                [sg.Text(i18n("音频切分大小")),
                 sg.Slider(range=(0.05, 3.0), orientation='h', key='block', resolution=0.01, default_value=0.5,
                           enable_events=True)],
                [sg.Text(i18n("交叉淡化时长")),
                 sg.Slider(range=(0.01, 0.15), orientation='h', key='crossfade', resolution=0.01,
                           default_value=0.04, enable_events=True)],
                [sg.Text(i18n("使用历史区块数量")),
                 sg.Slider(range=(1, 20), orientation='h', key='buffernum', resolution=1, default_value=4,
                           enable_events=True)],
                [sg.Text(i18n("f0预测模式")),
                 sg.Combo(values=self.f0_mode_list, key='f0_mode', default_value=self.f0_mode_list[-1],
                          enable_events=True)],
                [sg.Checkbox(text=i18n('启用增强器'), default=False, key='use_enhancer', enable_events=True),
                 sg.Checkbox(text='启用特征检索', default=False, key='use_feature_retrieval', enable_events=True),
                 sg.Checkbox(text='自动F0预测', default=False, key='auto_F0', enable_events=True)
                 ], [
                    sg.Checkbox(text=i18n('不推理安全区(加速但损失效果)'), default=False, key='diff_silence', enable_events=True),
                ]
            ], title=i18n('性能设置')),
            sg.Frame(layout=[
                [sg.Text(i18n("扩散模型文件"))],
                [sg.Input(key='diff_project', default_text='logs\\44k\\diffusion\\model_400000.pt'),
                 sg.FileBrowse(i18n('选择模型文件'), key='choose_model')],
                [sg.Text(i18n("扩散深度")), sg.Input(key='k_step', default_text='100', size=18)],
                [sg.Text(i18n("扩散加速")), sg.Input(key='diff_acc', default_text='10', size=18)],
                [sg.Text(i18n("扩散算法")),
                 sg.Combo(values=self.diff_method_list, key='diff_method', default_value=self.diff_method_list[0],
                          enable_events=True)],
                [sg.Checkbox(text=i18n('启用扩散'), key='diff_use', enable_events=True),
                 sg.Checkbox(text='启用二次编码', default=False, key='second_encoding', enable_events=True)
                 ]
            ], title=i18n('扩散设置')),
        ],
        [sg.Button(i18n("开始音频转换"), key="start_vc"), sg.Button(i18n("停止音频转换"), key="stop_vc"),
         sg.Text(i18n('推理所用时间(ms):')), sg.Text('0', key='infer_time')]
    ]

    # create the window
    self.window = sg.Window('SOVITS - REAL - TIME - GUI', layout, finalize=True)
    # Pressing <Return> in these inputs raises an event named after the key.
    self.window['samplerate'].bind('<Return>', '')
    self.window['k_step'].bind('<Return>', '')
    self.window['diff_acc'].bind('<Return>', '')
    self.event_handler()
|
||||
|
||||
def event_handler(self):
    """Run the window's event loop until the window is closed.

    Each event either updates ``self.config`` (and the loaded model, where
    one exists), starts the conversion, or - for any other event while a
    conversion is running - stops it.
    """

    def _parse_int(text):
        # Free-text inputs may hold anything; None marks "not a number".
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    while True:  # event loop
        event, values = self.window.read()
        if event == sg.WINDOW_CLOSED:  # the user closed the window
            self.flag_vc = False
            # Same effect as exit(), without relying on the site module.
            raise SystemExit

        print('event: ' + event)

        if event == 'start_vc' and self.flag_vc is False:
            # read every widget value into the config
            self.set_values(values)
            print('crossfade_time:' + str(self.config.crossfade_time))
            print("buffer_num:" + str(self.config.buffer_num))
            print("samplerate:" + str(self.config.samplerate))
            print('block_time:' + str(self.config.block_time))
            print("prefix_pad_length:" + str(self.f_safe_prefix_pad_length))
            # print("mix_mode:" + str(self.config.spk_mix_dict))
            print("enhancer:" + str(self.config.use_vocoder_based_enhancer))
            print("diffusion:" + str(self.config.diff_use))
            print('using_cuda:' + str(torch.cuda.is_available()))
            self.start_vc()
        elif event == 'k_step':
            k_step = _parse_int(values['k_step'])
            if k_step is None:
                # Not a number: put the current setting back in the field.
                self.window['k_step'].update(self.config.k_step)
            elif 1 <= k_step <= 1000:
                self.config.k_step = k_step
            else:
                # Out of range: reset the field and keep the config in step
                # with what the field now shows.
                self.config.k_step = 1000
                self.window['k_step'].update(1000)
        elif event == 'diff_acc':
            diff_acc = _parse_int(values['diff_acc'])
            if diff_acc is None:
                # Not a number: put the current setting back in the field.
                self.window['diff_acc'].update(self.config.diff_acc)
            else:
                if self.config.k_step < diff_acc:
                    self.config.diff_acc = int(self.config.k_step / 4)
                else:
                    self.config.diff_acc = diff_acc
                if self.svc_model is not None and hasattr(self.svc_model, "diffusion_model"):
                    self.svc_model.diffusion_args.infer.speedup = self.config.diff_acc
        elif event == 'diff_use':
            self.config.diff_use = values['diff_use']
            # Toggling diffusion always switches the enhancer off.
            self.window['use_enhancer'].update(False)
            self.config.use_vocoder_based_enhancer = False
            if self.svc_model is not None:
                self.svc_model.shallow_diffusion = self.config.diff_use
        elif event == 'diff_silence':
            self.config.diff_silence = values['diff_silence']
        elif event == 'diff_method':
            self.config.diff_method = values['diff_method']
            if self.svc_model is not None and hasattr(self.svc_model, "diffusion_model"):
                self.svc_model.diffusion_args.infer.method = self.config.diff_method
        elif event == 'spk_id':
            self.config.spk_id = values['spk_id']
        elif event == 'sg_model':
            # Refresh the speaker list from the config.json next to the model.
            model_config_path = os.path.join(os.path.dirname(values["sg_model"]), 'config.json')
            print("model_config_path:", model_config_path)
            try:
                with open(model_config_path) as config_file:
                    config = json.load(config_file)
                self.config.spk_list = list(config['spk'].keys())
                self.config.spk_id = self.config.spk_list[0]
                self.window['spk_id'].update(values=self.config.spk_list, value=self.config.spk_id)
            except Exception as e:
                print("This is a error path or config!")
                print(f"detail:{e}")
        elif event == 'threhold':
            self.config.threhold = values['threhold']
        elif event == 'pitch':
            self.config.f_pitch_change = values['pitch']
        elif event == 'second_encoding':
            self.config.second_encoding = values['second_encoding']
        elif event == 'auto_F0':
            self.config.auto_F0 = values['auto_F0']
        elif event == 'noice_scale':
            self.config.noice_scale = values['noice_scale']
        elif event == 'spk_mix':
            self.config.use_spk_mix = values['spk_mix']
        elif event == 'use_feature_retrieval':
            self.config.use_feature_retrieval = values['use_feature_retrieval']
        elif event == 'use_enhancer':
            self.config.use_vocoder_based_enhancer = values['use_enhancer']
            # Toggling the enhancer always switches diffusion off.
            self.window['diff_use'].update(False)
            self.config.diff_use = False
        elif event == 'load_config' and self.flag_vc is False:
            if self.config.load(values['config_file_dir']):
                self.update_values()
        elif event == 'save_config' and self.flag_vc is False:
            self.set_values(values)
            self.config.save(values['config_file_dir'])
        elif event != 'start_vc' and self.flag_vc is True:
            # Any other event while running (e.g. 'stop_vc') stops conversion.
            self.flag_vc = False
|
||||
|
||||
def set_values(self, values):
    """Copy the widget values into ``self.config`` and derive the frame sizes.

    Args:
        values: Mapping of widget key to current value, as returned by the
            window's ``read()``.
    """
    cfg = self.config
    input_name, output_name = values["sg_input_device"], values['sg_output_device']
    self.set_devices(input_name, output_name)
    cfg.sounddevices = [input_name, output_name]

    # (config attribute, widget key, converter) - applied in this order.
    fields = (
        ('checkpoint_path', 'sg_model', None),
        ('spk_id', 'spk_id', None),
        ('threhold', 'threhold', None),
        ('f_pitch_change', 'pitch', None),
        ('samplerate', 'samplerate', int),
        ('block_time', 'block', float),
        ('crossfade_time', 'crossfade', float),
        ('second_encoding', 'second_encoding', None),
        ('buffer_num', 'buffernum', int),
        ('select_pitch_extractor', 'f0_mode', None),
        ('use_vocoder_based_enhancer', 'use_enhancer', None),
        ('use_feature_retrieval', 'use_feature_retrieval', None),
        ('cluster_infer_ratio', 'cluster_infer_ratio', None),
        ('noice_scale', 'noice_scale', float),
        ('kmeans_path', 'kmeans_model', None),
        ('diff_use', 'diff_use', None),
        ('auto_F0', 'auto_F0', None),
        ('diff_silence', 'diff_silence', None),
        ('diff_method', 'diff_method', None),
        ('diff_project', 'diff_project', None),
        ('diff_acc', 'diff_acc', int),
        ('k_step', 'k_step', int),
    )
    for attr, key, convert in fields:
        raw = values[key]
        setattr(cfg, attr, raw if convert is None else convert(raw))

    # Durations (seconds) converted to sample counts at the configured rate.
    rate = cfg.samplerate
    block = int(cfg.block_time * rate)
    crossfade = int(cfg.crossfade_time * rate)
    sola_search = int(0.01 * rate)
    last_delay = int(0.02 * rate)
    self.block_frame = block
    self.crossfade_frame = crossfade
    self.sola_search_frame = sola_search
    self.last_delay_frame = last_delay
    self.input_frames = max(
        block + crossfade + sola_search + 2 * last_delay,
        (1 + cfg.buffer_num) * block)
    self.f_safe_prefix_pad_length = cfg.block_time * cfg.buffer_num - cfg.crossfade_time - 0.01 - 0.02
|
||||
|
||||
def update_values(self):
    """Push every setting in ``self.config`` back into its widget.

    This is the counterpart of ``set_values``: every config field that
    ``set_values`` reads from the window is written back here.
    """
    self.window['sg_model'].update(self.config.checkpoint_path)
    self.window['sg_input_device'].update(self.config.sounddevices[0])
    self.window['sg_output_device'].update(self.config.sounddevices[1])
    self.window['spk_id'].update(values=self.config.spk_list, value=self.config.spk_id)
    self.window['threhold'].update(self.config.threhold)
    self.window['pitch'].update(self.config.f_pitch_change)
    self.window['auto_F0'].update(self.config.auto_F0)
    self.window['samplerate'].update(self.config.samplerate)
    self.window['use_feature_retrieval'].update(self.config.use_feature_retrieval)
    self.window['cluster_infer_ratio'].update(self.config.cluster_infer_ratio)
    self.window['noice_scale'].update(self.config.noice_scale)
    self.window['kmeans_model'].update(self.config.kmeans_path)
    # self.window['spk_mix'].update(self.config.use_spk_mix)
    self.window['block'].update(self.config.block_time)
    self.window['crossfade'].update(self.config.crossfade_time)
    self.window['buffernum'].update(self.config.buffer_num)
    self.window['f0_mode'].update(self.config.select_pitch_extractor)
    self.window['use_enhancer'].update(self.config.use_vocoder_based_enhancer)
    self.window['diff_use'].update(self.config.diff_use)
    self.window['diff_silence'].update(self.config.diff_silence)
    self.window['diff_method'].update(self.config.diff_method)
    self.window['diff_project'].update(self.config.diff_project)
    self.window['diff_acc'].update(self.config.diff_acc)
    self.window['k_step'].update(self.config.k_step)
    # Previously missing: without it a loaded config's second_encoding was
    # overwritten by the stale checkbox on the next set_values().
    self.window['second_encoding'].update(self.config.second_encoding)
|
||||
|
||||
def start_vc(self):
    """Start voice conversion.

    Allocates the input buffer and the crossfade state, loads the model for
    the configured checkpoint and starts ``soundinput`` on a new thread.
    """
    torch.cuda.empty_cache()
    self.flag_vc = True
    self.input_wav = np.zeros(self.input_frames, dtype='float32')
    self.sola_buffer = torch.zeros(self.crossfade_frame, device=self.device)
    # Ramp 0, 1/n, ..., (n-1)/n built from an integer range: a float step
    # (arange(0, 1, 1/n)) is subject to rounding and can yield n + 1 samples,
    # which would no longer match the crossfade slices.
    ramp = torch.arange(self.crossfade_frame, device=self.device) / self.crossfade_frame
    self.fade_in_window = torch.sin(np.pi * ramp / 2) ** 2
    self.fade_out_window = 1 - self.fade_in_window
    self.update_model(self.config.checkpoint_path)
    thread_vc = threading.Thread(target=self.soundinput)
    thread_vc.start()
|
||||
|
||||
def soundinput(self):
    """Keep an audio stream open for as long as ``self.flag_vc`` is set.

    Audio is processed by ``self.audio_callback``; this method only holds the
    stream open, waking once per block to re-check the flag.
    """
    stream_options = {
        'callback': self.audio_callback,
        'blocksize': self.block_frame,
        'samplerate': self.config.samplerate,
        'dtype': 'float32',
    }
    stream = sd.Stream(**stream_options)
    with stream:
        while True:
            if not self.flag_vc:
                break
            time.sleep(self.config.block_time)
            print('Audio block passed.')
    print('ENDing VC')
|
||||
|
||||
def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
    '''
    Stream callback: convert one block of input audio and write it to ``outdata``.

    Shifts the new block into the rolling input buffer, runs the model on the
    buffer, gates the result with a volume mask, resamples if the model rate
    differs from the stream rate, then aligns (SOLA) and crossfades the block
    with the tail of the previous output.
    '''
    start_time = time.perf_counter()
    print("\nStarting callback")
    # Drop the oldest block and append the new one (down-mixed to mono).
    self.input_wav[:] = np.roll(self.input_wav, -self.block_frame)
    self.input_wav[-self.block_frame:] = librosa.to_mono(indata.T)

    if self.config.diff_silence:
        # Skip the leading "safe" stretch, expressed in model hop frames.
        start_frame = int(self.f_safe_prefix_pad_length * self.svc_model.target_sample / self.svc_model.hop_size)
        audio = self.input_wav[start_frame * self.svc_model.hop_size:]
    else:
        start_frame = None
        audio = self.input_wav

    vol = self.svc_model.volume_extractor.extract(torch.FloatTensor(audio)[None,:].to(self.device))[None,:]
    # The threshold is in dB; 10 ** (dB / 20) converts it to linear amplitude.
    vol_mask = (vol > 10 ** (float(self.config.threhold) / 20)).to(torch.float) #[1, T]
    # Widen the mask so it also covers frames next to an above-threshold one.
    vol_mask = torch.max_pool1d(vol_mask, kernel_size=8, stride=1, padding= 4)
    # infer
    # NOTE(review): arguments are positional; their meaning is defined by
    # Svc.infer, which is not visible here - confirm against its signature.
    _audio, _audio_len, n_frames = self.svc_model.infer(
        self.config.spk_id,
        self.config.f_pitch_change,
        self.input_wav,
        self.config.cluster_infer_ratio,
        self.config.auto_F0,
        self.config.noice_scale,
        False,
        self.config.select_pitch_extractor,
        0,
        0.05,
        self.config.k_step,
        0,
        False,
        self.config.second_encoding,
        1,
        vol,
        start_frame,
        False
    )
    # Stretch the frame-level mask to the output length and gate the audio.
    vol_mask = torch.nn.functional.interpolate(vol_mask[:,None,:], size=_audio.shape[-1], mode='linear')[0,0,:]
    _audio *= vol_mask

    if self.config.diff_silence and start_frame is not None and start_frame > 0:
        # Left-pad with zeros for the skipped prefix.
        _audio = F.pad(_audio, (start_frame * self.svc_model.hop_size, 0))

    _model_sr = self.svc_model.target_sample

    # debug sola
    '''
    _audio, _model_sr = self.input_wav, self.config.samplerate
    rs = int(np.random.uniform(-200,200))
    print('debug_random_shift: ' + str(rs))
    _audio = np.roll(_audio, rs)
    _audio = torch.from_numpy(_audio).to(self.device)
    '''

    if _model_sr != self.config.samplerate:
        # Resamplers are cached per (model rate, stream rate) pair.
        key_str = str(_model_sr) + '_' + str(self.config.samplerate)
        if key_str not in self.resample_kernel:
            self.resample_kernel[key_str] = Resample(_model_sr, self.config.samplerate,
                                                     lowpass_filter_width=128).to(self.device)
        _audio = self.resample_kernel[key_str](_audio)
    # Last block + crossfade + SOLA search range, ending last_delay_frame
    # samples before the end of the converted audio.
    temp_wav = _audio[
               - self.block_frame - self.crossfade_frame - self.sola_search_frame - self.last_delay_frame: - self.last_delay_frame]

    # sola shift
    # Pick the offset whose normalized correlation with the previous tail is highest.
    conv_input = temp_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
    cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
    cor_den = torch.sqrt(
        F.conv1d(conv_input ** 2, torch.ones(1, 1, self.crossfade_frame, device=self.device)) + 1e-8)
    sola_shift = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
    temp_wav = temp_wav[sola_shift: sola_shift + self.block_frame + self.crossfade_frame]
    print('sola_shift: ' + str(int(sola_shift)))

    # Crossfade the head of this block with the tail of the previous one.
    temp_wav[: self.crossfade_frame] *= self.fade_in_window
    temp_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window

    # Keep this block's tail for the next callback.
    self.sola_buffer = temp_wav[- self.crossfade_frame:]

    # Duplicate the mono signal into two output channels.
    outdata[:] = temp_wav[: - self.crossfade_frame, None].repeat(1, 2).cpu().numpy()
    end_time = time.perf_counter()
    print('infer_time: ' + str(end_time - start_time))
    self.window['infer_time'].update(int((end_time - start_time) * 1000))
|
||||
|
||||
def get_devices(self, update: bool = True):
    """List the audio devices known to sounddevice.

    Args:
        update: When True, sounddevice is terminated and initialised again
            before querying (presumably to pick up newly attached devices -
            confirm).

    Returns:
        ``(input_names, output_names, input_indices, output_indices)``. Names
        have the form ``"<device name> (<host API name>)"`` and each index
        list is parallel to its name list.
    """
    if update:
        sd._terminate()
        sd._initialize()

    all_devices = sd.query_devices()
    # Tag every device with the name of the host API it belongs to.
    for api in sd.query_hostapis():
        for position in api["devices"]:
            all_devices[position]["hostapi_name"] = api["name"]

    def label(dev):
        return f"{dev['name']} ({dev['hostapi_name']})"

    capture = [dev for dev in all_devices if dev["max_input_channels"] > 0]
    playback = [dev for dev in all_devices if dev["max_output_channels"] > 0]

    input_devices = [label(dev) for dev in capture]
    output_devices = [label(dev) for dev in playback]
    input_devices_indices = [dev["index"] for dev in capture]
    output_devices_indices = [dev["index"] for dev in playback]
    return input_devices, output_devices, input_devices_indices, output_devices_indices
|
||||
|
||||
def set_devices(self, input_device, output_device):
    """Make the named devices sounddevice's default input and output.

    Args:
        input_device: Display name as produced by ``get_devices``.
        output_device: Display name as produced by ``get_devices``.

    Raises:
        ValueError: If a name is not in the current device list.
    """
    in_names, out_names, in_ids, out_ids = self.get_devices()
    chosen_input = in_ids[in_names.index(input_device)]
    sd.default.device[0] = chosen_input
    chosen_output = out_ids[out_names.index(output_device)]
    sd.default.device[1] = chosen_output
    print(f"input device:{sd.default.device[0]!s}:{input_device!s}")
    print(f"output device:{sd.default.device[1]!s}:{output_device!s}")
|
||||
|
||||
def update_model(self, model_path):
    """Load the model at ``model_path`` together with the configured add-ons.

    ``config.json`` is looked up next to the model and ``config.yaml`` next to
    the diffusion model (``self.config.diff_project``). Nothing is loaded when
    the model's directory does not exist.
    """
    # NOTE(review): the block nesting below was reconstructed from a
    # whitespace-stripped source - confirm against the original file.
    model_dir = os.path.dirname(model_path)
    model_config_path = os.path.join(model_dir, 'config.json')
    model_diff = self.config.diff_project
    model_diff_dir = os.path.dirname(model_diff)
    model_diff_config_path = os.path.join(model_diff_dir, 'config.yaml')
    print("model_dir:",model_dir)
    print("model_config_path:",model_config_path)
    print("model_diff:",model_diff)
    print("model_diff_config_path:",model_diff_config_path)
    if os.path.exists(model_dir):
        self.svc_model = Svc(model_path,
                             model_config_path,
                             self.device,
                             cluster_model_path=self.config.kmeans_path,
                             nsf_hifigan_enhance=self.config.use_vocoder_based_enhancer,
                             diffusion_model_path=model_diff,
                             diffusion_config_path=model_diff_config_path,
                             shallow_diffusion=self.config.diff_use,
                             only_diffusion=False,
                             spk_mix_enable=False,
                             feature_retrieval=self.config.use_feature_retrieval,
                             )
        # NOTE(review): sets an `onnx` flag on the decoder and its sine
        # generator; what the flag changes is not visible here - confirm.
        self.svc_model.net_g_ms.dec.onnx = True
        self.svc_model.net_g_ms.dec.m_source.l_sin_gen.onnx = True
        # The stream sample rate follows the model's target sample rate.
        self.config.samplerate = self.svc_model.target_sample
        # self.config.spk_list= list(self.svc_model.spk2id.keys())
        # self.config.spk_id = self.config.spk_list[0]
        if hasattr(self.svc_model, "diffusion_model"):
            # Apply the GUI's diffusion settings to the loaded diffusion model.
            self.svc_model.diffusion_args.infer.speedup = self.config.diff_acc
            self.svc_model.diffusion_args.infer.method = self.config.diff_method
        # Reflect the (possibly changed) config in the widgets.
        self.update_values()
|
||||
|
||||
if __name__ == "__main__":
    # `i18n` must exist as a module-level global before GUI is created:
    # GUI.launcher() calls it to translate the widget labels.
    i18n = I18nAuto()
    # GUI.__init__ builds the window and enters the event loop.
    gui = GUI()
|
|
@ -0,0 +1,155 @@
|
|||
import locale
|
||||
|
||||
'''
|
||||
本地化方式如下所示
|
||||
'''
|
||||
|
||||
# Languages that can be selected; anything else falls back to 'zh_CN'.
LANGUAGE_LIST = ['zh_CN', 'en_US', 'ja_JP']

# Translation tables keyed by the Chinese source label. 'SUPER' names the
# table a language inherits from ('END' marks the root table); 'LANGUAGE' is
# the table's own code. Every table must use exactly the same keys.
LANGUAGE_ALL = {
    'zh_CN': {
        'SUPER': 'END',
        'LANGUAGE': 'zh_CN',
        '选择模型文件': '选择模型文件',
        '模型:.pth格式(自动识别同目录下config.json)': '模型:.pth格式(自动识别同目录下config.json)',
        '选择配置文件所在目录': '选择配置文件所在目录',
        '打开文件夹': '打开文件夹',
        '读取配置文件': '读取配置文件',
        '保存配置文件': '保存配置文件',
        '快速配置文件': '快速配置文件',
        '输入设备': '输入设备',
        '输出设备': '输出设备',
        '音频设备': '音频设备',
        '说话人': '说话人',
        '响应阈值': '响应阈值',
        '变调': '变调',
        '采样率': '采样率',
        '启用捏音色功能': '启用捏音色功能',
        '设置混合音色': '设置混合音色',
        '普通设置': '普通设置',
        '音频切分大小': '音频切分大小',
        '交叉淡化时长': '交叉淡化时长',
        '使用历史区块数量': '使用历史区块数量',
        'f0预测模式': 'f0预测模式',
        '启用增强器': '启用增强器',
        '启用相位声码器': '启用相位声码器',
        '性能设置': '性能设置',
        '开始音频转换': '开始音频转换',
        '停止音频转换': '停止音频转换',
        '推理所用时间(ms):': '推理所用时间(ms):',
        '扩散设置': '扩散设置',
        '启用扩散': '启用扩散',
        '扩散加速': '扩散加速',
        '扩散深度': '扩散深度',
        '扩散说话人id': '扩散说话人id',
        '扩散模型文件': '扩散模型文件',
        '不推理安全区(加速但损失效果)': '不推理安全区(加速但损失效果)',
        '扩散算法': '扩散算法'
    },
    'en_US': {
        'SUPER': 'zh_CN',
        'LANGUAGE': 'en_US',
        '选择模型文件': 'Select Model File',
        '模型:.pth格式(自动识别同目录下config.json)': 'Model:.pth format(Auto ust config.json in here)',
        '选择配置文件所在目录': 'Select the configuration file directory',
        '打开文件夹': 'Open folder',
        '读取配置文件': 'Read config file',
        '保存配置文件': 'Save config file',
        '快速配置文件': 'Fast config file',
        '输入设备': 'Input device',
        '输出设备': 'Output device',
        '音频设备': 'Audio devices',
        '说话人': 'Speaker',
        '响应阈值': 'Response threshold',
        '变调': 'Pitch',
        '采样率': 'Sampling rate',
        '启用捏音色功能': 'Enable Mix Speaker',
        '设置混合音色': 'Mix Speaker',
        '普通设置': 'Normal Settings',
        '音频切分大小': 'Segmentation size',
        '交叉淡化时长': 'Cross fade duration',
        '使用历史区块数量': 'Historical blocks used',
        'f0预测模式': 'f0Extractor',
        '启用增强器': 'Enable Enhancer',
        '启用相位声码器': 'Enable Phase Vocoder',
        '性能设置': 'Performance settings',
        '开始音频转换': 'Start conversion',
        '停止音频转换': 'Stop conversion',
        '推理所用时间(ms):': 'Inference time(ms):',
        '扩散设置': '扩散设置',
        '启用扩散': '启用扩散',
        '扩散加速': '扩散加速',
        '扩散深度': '扩散深度',
        '扩散说话人id': '扩散说话人id',
        '扩散模型文件': '扩散模型文件',
        '不推理安全区(加速但损失效果)': '不推理安全区(加速但损失效果)',
        '扩散算法': '扩散算法'
    },
    'ja_JP': {
        'SUPER': 'zh_CN',
        'LANGUAGE': 'ja_JP',
        '选择模型文件': 'モデルを選択',
        '模型:.pth格式(自动识别同目录下config.json)': 'モデル:.pth形式(同じディレクトリにあるconfig.jsonを自動認識します)',
        '选择配置文件所在目录': '設定ファイルを選択',
        '打开文件夹': 'フォルダを開く',
        '读取配置文件': '設定ファイルを読み込む',
        '保存配置文件': '設定ファイルを保存',
        '快速配置文件': '設定プロファイル',
        '输入设备': '入力デバイス',
        '输出设备': '出力デバイス',
        '音频设备': '音声デバイス',
        '说话人': '話者',
        '响应阈值': '応答時の閾値',
        '变调': '音程',
        '采样率': 'サンプリングレート',
        '启用捏音色功能': 'ミキシングを有効化',
        '设置混合音色': 'ミキシング',
        '普通设置': '通常設定',
        '音频切分大小': 'セグメンテーションのサイズ',
        '交叉淡化时长': 'クロスフェードの間隔',
        '使用历史区块数量': '使用するヒストリカルブロック数',
        'f0预测模式': 'f0予測モデル',
        '启用增强器': 'Enhancerを有効化',
        '启用相位声码器': 'フェーズボコーダを有効化',
        '性能设置': 'パフォーマンスの設定',
        '开始音频转换': '変換開始',
        '停止音频转换': '変換停止',
        '推理所用时间(ms):': '推論時間(ms):',
        '扩散设置': '扩散设置',
        '启用扩散': '启用扩散',
        '扩散加速': '扩散加速',
        '扩散深度': '扩散深度',
        '扩散说话人id': '扩散说话人id',
        '扩散模型文件': '扩散模型文件',
        # Key corrected: it was mistyped as '不扩散安全区…', which no lookup uses.
        '不推理安全区(加速但损失效果)': '不推理安全区(加速但损失效果)',
        '扩散算法': '扩散算法'
    }
}
|
||||
|
||||
|
||||
class I18nAuto:
    """Callable translator built from the module-level language tables.

    Every language table carries a ``'SUPER'`` entry naming the table it
    inherits from; a table whose ``'SUPER'`` is ``'END'`` is the root of the
    chain.  Tables are merged root-first so that entries of the selected
    language override the entries it inherits.
    """

    def __init__(self, language=None):
        """Select a language and build the merged lookup table.

        Args:
            language: A language code from ``LANGUAGE_LIST``, ``'auto'`` or
                ``None`` (treated as ``'auto'``) to use the system default
                locale.  Any value not found in ``LANGUAGE_LIST`` falls back
                to ``'zh_CN'``.

        Raises:
            ValueError: If the ``'SUPER'`` chain of the selected language
                contains a cycle.
        """
        self.language_list = LANGUAGE_LIST
        self.language_all = LANGUAGE_ALL
        self.language_map = {}

        if language is None:
            language = 'auto'
        if language == 'auto':
            # First element is the language code; whatever it yields is
            # validated against the known languages just below.
            language = locale.getdefaultlocale()[0]
        if language not in self.language_list:
            language = 'zh_CN'
        self.language = language

        # Walk from the selected language up to the root of its chain.
        chain = []
        seen = set()
        while self.language_all[language]['SUPER'] != 'END':
            if language in seen:
                # Without this guard a cyclic chain would loop forever.
                raise ValueError(
                    "Cyclic 'SUPER' chain in language table: " + str(language)
                )
            seen.add(language)
            chain.append(language)
            language = self.language_all[language]['SUPER']
        # `language` is now the actual root of the chain (the table whose
        # 'SUPER' is 'END'); use it instead of assuming the root is 'zh_CN'.
        chain.append(language)

        # Apply root first so that more specific languages override it.
        for _lang in reversed(chain):
            self.read_language(self.language_all[_lang])

    def read_language(self, lang_dict: dict):
        """Merge ``lang_dict`` into the lookup table, overriding existing keys."""
        self.language_map.update(lang_dict)

    def __call__(self, key):
        """Return the translation stored for ``key``.

        Raises:
            KeyError: If ``key`` is not present in the merged table.
        """
        return self.language_map[key]
|
|
@ -22,6 +22,8 @@ from diffusion.unit2mel import load_model_vocoder
|
|||
from inference import slicer
|
||||
from models import SynthesizerTrn
|
||||
|
||||
torchaudio.set_audio_backend("soundfile")
|
||||
|
||||
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
||||
|
||||
|
||||
|
@ -168,7 +170,9 @@ class Svc(object):
|
|||
else:
|
||||
self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
|
||||
self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
|
||||
|
||||
|
||||
self.hubert_model.model = self.hubert_model.model.to(self.dtype)
|
||||
|
||||
if os.path.exists(cluster_model_path):
|
||||
if self.feature_retrieval:
|
||||
with open(cluster_model_path,"rb") as f:
|
||||
|
@ -198,6 +202,7 @@ class Svc(object):
|
|||
_ = self.net_g_ms.half().eval().to(self.dev)
|
||||
else:
|
||||
_ = self.net_g_ms.eval().to(self.dev)
|
||||
del self.net_g_ms.enc_q
|
||||
if spk_mix_enable:
|
||||
self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
|
||||
|
||||
|
@ -216,9 +221,9 @@ class Svc(object):
|
|||
f0 = f0.unsqueeze(0)
|
||||
uv = uv.unsqueeze(0)
|
||||
|
||||
wav = torch.from_numpy(wav).to(self.dev)
|
||||
wav = torch.from_numpy(wav).to(self.dev).to(self.dtype)
|
||||
if not hasattr(self,"audio16k_resample_transform"):
|
||||
self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
|
||||
self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dtype).to(self.dev)
|
||||
wav16k = self.audio16k_resample_transform(wav[None,:])[0]
|
||||
|
||||
c = self.hubert_model.encoder(wav16k)
|
||||
|
@ -227,7 +232,7 @@ class Svc(object):
|
|||
if cluster_infer_ratio !=0:
|
||||
if self.feature_retrieval:
|
||||
speaker_id = self.spk2id.get(speaker)
|
||||
if not speaker_id and type(speaker) is int:
|
||||
if not speaker_id and type(speaker) is int: # noqa: E721
|
||||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
if speaker_id is None:
|
||||
|
@ -265,20 +270,25 @@ class Svc(object):
|
|||
frame = 0,
|
||||
spk_mix = False,
|
||||
second_encoding = False,
|
||||
loudness_envelope_adjustment = 1
|
||||
loudness_envelope_adjustment = 1,
|
||||
vol = None,
|
||||
start_frame = None,
|
||||
use_tqdm = True
|
||||
):
|
||||
torchaudio.set_audio_backend("soundfile")
|
||||
wav, sr = torchaudio.load(raw_path)
|
||||
if not hasattr(self,"audio_resample_transform") or self.audio16k_resample_transform.orig_freq != sr:
|
||||
self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
|
||||
wav = self.audio_resample_transform(wav).numpy()[0]
|
||||
if isinstance(raw_path, str) or isinstance(raw_path, io.BytesIO):
|
||||
wav, sr = torchaudio.load(raw_path)
|
||||
if not hasattr(self,"audio_resample_transform") or self.audio_resample_transform.orig_freq != sr:
|
||||
self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
|
||||
wav = self.audio_resample_transform(wav).numpy()[0]
|
||||
else:
|
||||
wav = raw_path
|
||||
if spk_mix:
|
||||
c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
|
||||
n_frames = f0.size(1)
|
||||
sid = speaker[:, frame:frame+n_frames].transpose(0,1)
|
||||
else:
|
||||
speaker_id = self.spk2id.get(speaker)
|
||||
if not speaker_id and type(speaker) is int:
|
||||
if not speaker_id and type(speaker) is int: # noqa: E721
|
||||
if len(self.spk2id.__dict__) >= speaker:
|
||||
speaker_id = speaker
|
||||
if speaker_id is None:
|
||||
|
@ -289,11 +299,15 @@ class Svc(object):
|
|||
c = c.to(self.dtype)
|
||||
f0 = f0.to(self.dtype)
|
||||
uv = uv.to(self.dtype)
|
||||
if start_frame is not None:
|
||||
c = c[:,:,start_frame:]
|
||||
f0 = f0[:,start_frame:]
|
||||
uv = uv[:,start_frame:]
|
||||
with torch.no_grad():
|
||||
start = time.time()
|
||||
vol = None
|
||||
if not self.only_diffusion:
|
||||
vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
|
||||
vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding and vol is None else vol
|
||||
vol = vol.to(self.dtype) if vol is not None else vol
|
||||
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
|
||||
audio = audio[0,0].data.float()
|
||||
audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
|
||||
|
@ -324,7 +338,9 @@ class Svc(object):
|
|||
infer=True,
|
||||
infer_speedup=self.diffusion_args.infer.speedup,
|
||||
method=self.diffusion_args.infer.method,
|
||||
k_step=k_step)
|
||||
k_step=k_step,
|
||||
use_tqdm = use_tqdm
|
||||
)
|
||||
audio = self.vocoder.infer(audio_mel, f0).squeeze()
|
||||
if self.nsf_hifigan_enhance:
|
||||
audio, _ = self.enhancer.enhance(
|
||||
|
|
|
@ -522,7 +522,7 @@ class SynthesizerTrn(nn.Module):
|
|||
|
||||
if self.use_automatic_f0_prediction and predict_f0:
|
||||
lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
|
||||
norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
|
||||
norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False).to(f0)
|
||||
pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
|
||||
f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument("--speech_encoder", type=str, default="vec768l12", help="choice a speech encoder|'vec768l12','vec256l9','hubertsoft','whisper-ppg','cnhubertlarge','dphubert','whisper-ppg-large','wavlmbase+'")
|
||||
parser.add_argument("--vol_aug", action="store_true", help="Whether to use volume embedding and volume augmentation")
|
||||
parser.add_argument("--tiny", action="store_true", help="Whether to train sovits tiny")
|
||||
parser.add_argument("--tf_flow", action="store_true", help="Whether to Use transformer_flow")
|
||||
args = parser.parse_args()
|
||||
|
||||
config_template = json.load(open("configs_template/config_tiny_template.json")) if args.tiny else json.load(open("configs_template/config_template.json"))
|
||||
|
@ -111,7 +112,10 @@ if __name__ == "__main__":
|
|||
|
||||
if args.tiny:
|
||||
config_template["model"]["filter_channels"] = 512
|
||||
|
||||
|
||||
if args.tf_flow:
|
||||
config_template["model"]["flow_share_parameter"] = config_template["model"]["use_transformer_flow"] = True
|
||||
|
||||
logger.info("Writing to configs/config.json")
|
||||
with open("configs/config.json", "w") as f:
|
||||
json.dump(config_template, f, indent=2)
|
||||
|
|
3
utils.py
3
utils.py
|
@ -100,7 +100,7 @@ def get_f0_predictor(f0_predictor,hop_length,sampling_rate,**kargs):
|
|||
f0_predictor_object = DioF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate)
|
||||
elif f0_predictor == "rmvpe":
|
||||
from modules.F0Predictor.RMVPEF0Predictor import RMVPEF0Predictor
|
||||
f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"])
|
||||
f0_predictor_object = RMVPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float16 ,device=kargs["device"],threshold=kargs["threshold"])
|
||||
elif f0_predictor == "fcpe":
|
||||
from modules.F0Predictor.FCPEF0Predictor import FCPEF0Predictor
|
||||
f0_predictor_object = FCPEF0Predictor(hop_length=hop_length,sampling_rate=sampling_rate,dtype=torch.float32 ,device=kargs["device"],threshold=kargs["threshold"])
|
||||
|
@ -566,7 +566,6 @@ class Volume_Extractor:
|
|||
audio = torch.Tensor(audio)
|
||||
n_frames = int(audio.size(-1) // self.hop_size)
|
||||
audio2 = audio ** 2
|
||||
audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
|
||||
volume = torch.nn.functional.unfold(audio2[:,None,None,:],(1,self.hop_size),stride=self.hop_size)[:,:,:n_frames].mean(dim=1)[0]
|
||||
volume = torch.sqrt(volume)
|
||||
return volume
|
||||
|
|
|
@ -238,14 +238,7 @@ class SineGen(torch.nn.Module):
|
|||
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
|
||||
)
|
||||
sine_waves = sine_waves * self.sine_amp
|
||||
uv = self._f02uv(f0)
|
||||
uv = F.interpolate(
|
||||
uv.transpose(2, 1), scale_factor=upp, mode="nearest"
|
||||
).transpose(2, 1)
|
||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||
noise = noise_amp * torch.randn_like(sine_waves)
|
||||
sine_waves = sine_waves * uv + noise
|
||||
return sine_waves, uv, noise
|
||||
return sine_waves, 0, 0
|
||||
else:
|
||||
with torch.no_grad():
|
||||
# fundamental component
|
||||
|
@ -253,22 +246,8 @@ class SineGen(torch.nn.Module):
|
|||
|
||||
# generate sine waveforms
|
||||
sine_waves = self._f02sine(fn) * self.sine_amp
|
||||
|
||||
# generate uv signal
|
||||
# uv = torch.ones(f0.shape)
|
||||
# uv = uv * (f0 > self.voiced_threshold)
|
||||
uv = self._f02uv(f0)
|
||||
|
||||
# noise: for unvoiced should be similar to sine_amp
|
||||
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
||||
# . for voiced regions is self.noise_std
|
||||
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
||||
noise = noise_amp * torch.randn_like(sine_waves)
|
||||
|
||||
# first: set the unvoiced part to 0 by uv
|
||||
# then: additive noise
|
||||
sine_waves = sine_waves * uv + noise
|
||||
return sine_waves, uv, noise
|
||||
|
||||
return sine_waves, 0, 0
|
||||
|
||||
|
||||
class SourceModuleHnNSF(torch.nn.Module):
|
||||
|
@ -312,12 +291,10 @@ class SourceModuleHnNSF(torch.nn.Module):
|
|||
noise_source (batchsize, length 1)
|
||||
"""
|
||||
# source for harmonic branch
|
||||
sine_wavs, uv, _ = self.l_sin_gen(x, upp)
|
||||
sine_wavs, _, _ = self.l_sin_gen(x, upp)
|
||||
sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
|
||||
|
||||
# source for noise branch, in the same shape as uv
|
||||
noise = torch.randn_like(uv) * self.sine_amp / 3
|
||||
return sine_merge, noise, uv
|
||||
return sine_merge, 0, 0
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
|
||||
|
|
Loading…
Reference in New Issue