diff --git a/LICENSE b/LICENSE index c7202d4..28bac26 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,28 @@ -MIT License +BSD 3-Clause License -Copyright (c) 2021 Jingyi Li +Copyright (c) 2023, SVC Develop Team -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index c3234df..3253df2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,10 @@ #### ✨ A client supports real-time conversion: [w-okada/voice-changer](https://github.com/w-okada/voice-changer) +## Warning!!! + +This project is an open-source, offline project, and all members of SvcDevelopTeam and all developers and maintainers of this project (hereinafter referred to as contributors) have no control over this project. The contributors of this project have never provided any organization or individual with any form of assistance, including but not limited to dataset extraction, dataset processing, computing support, training support, inference, etc. Contributors to the project do not and cannot know what users are using the project for.
Therefore, all AI models and synthesized audio produced by training with this project have nothing to do with the contributors of this project. All problems arising therefrom shall be borne by the user. + ## 📏 Terms of Use # Warning: Please solve the authorization problem of the dataset on your own. You shall be solely responsible for any problems caused by the use of non-authorized datasets for training and all consequences thereof. The repository and its maintainer, svc develop team, have nothing to do with the consequences! @@ -13,8 +17,8 @@ 1. This project is established for academic exchange purposes only and is intended for communication and learning purposes. It is not intended for production environments. 2. Any videos based on sovits that are published on video platforms must clearly indicate in the description that they are used for voice changing and specify the input source of the voice or audio, for example, using videos or audios published by others and separating the vocals as input source for conversion, which must provide clear original video or music links. If your own voice or other synthesized voices from other commercial vocal synthesis software are used as the input source for conversion, you must also explain it in the description. 3. You shall be solely responsible for any infringement problems caused by the input source. When using other commercial vocal synthesis software as input source, please ensure that you comply with the terms of use of the software. Note that many vocal synthesis engines clearly state in their terms of use that they cannot be used for input source conversion. -4. Continuing to use this project is deemed as agreeing to the relevant provisions stated in this repository README. This repository README has the obligation to persuade, and is not responsible for any subsequent problems that may arise. -5. If you distribute this repository's code or publish any results produced by this project publicly (including but not limited to video sharing platforms), please indicate the original author and code source (this repository). +4. It is forbidden to use this project for illegal activities or for religious or political activities. The project developers firmly oppose such activities; if you do not agree with this clause, you may not use this project. +5. Continuing to use this project is deemed as agreeing to the relevant provisions stated in this repository README. This repository README has the obligation to persuade, and is not responsible for any subsequent problems that may arise. 6. If you use this project for any other plan, please contact and inform the author of this repository in advance. Thank you very much. ## 🆕 Update! @@ -34,7 +38,7 @@ The singing voice conversion model uses SoftVC content encoder to extract source - The dataset creation and training process are consistent with version 3.0, but the model is completely non-universal, and the data set needs to be fully pre-processed again. - Added an option 1: automatic pitch prediction for vc mode, which means that you don't need to manually enter the pitch key when converting speech, and the pitch of male and female voices can be automatically converted. However, this mode will cause pitch shift when converting songs. - Added option 2: reduce timbre leakage through k-means clustering scheme, making the timbre more similar to the target timbre.
-- Added option 3: Added [NFS-HIFIGAN Enhancer](https://github.com/yxlllc/DDSP-SVC), which has certain sound quality enhancement effect on some models with few train-sets, but has negative effect on well-trained models, so it is closed by default +- Added option 3: Added [NSF-HIFIGAN Enhancer](https://github.com/yxlllc/DDSP-SVC), which has certain sound quality enhancement effect on some models with few train-sets, but has negative effect on well-trained models, so it is closed by default ## 💬 About Python Version diff --git a/README_zh_CN.md b/README_zh_CN.md index 624f95e..9bb1a0d 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -6,6 +6,10 @@ #### ✨ 支持实时转换的一个客户端:[w-okada/voice-changer](https://github.com/w-okada/voice-changer) +## 声明 + +本项目为开源、离线的项目,SvcDevelopTeam的所有成员与本项目的所有开发者以及维护者(以下简称贡献者)对本项目没有控制力。本项目的贡献者从未向任何组织或个人提供包括但不限于数据集提取、数据集加工、算力支持、训练支持、推理等一切形式的帮助;本项目的贡献者不知晓也无法知晓使用者使用该项目的用途。故一切基于本项目训练的AI模型和合成的音频都与本项目贡献者无关。一切由此造成的问题由使用者自行承担。 + ## 📏 使用规约 # Warning:请自行解决数据集授权问题,禁止使用非授权数据集进行训练!任何由于使用非授权数据集进行训练造成的问题,需自行承担全部责任和后果!与仓库、仓库维护者、svc develop team 无关! @@ -13,8 +17,8 @@ 1. 本项目是基于学术交流目的建立,仅供交流与学习使用,并非为生产环境准备。 2. 任何发布到视频平台的基于 sovits 制作的视频,都必须要在简介明确指明用于变声器转换的输入源歌声、音频,例如:使用他人发布的视频 / 音频,通过分离的人声作为输入源进行转换的,必须要给出明确的原视频、音乐链接;若使用是自己的人声,或是使用其他歌声合成引擎合成的声音作为输入源进行转换的,也必须在简介加以说明。 3. 由输入源造成的侵权问题需自行承担全部责任和一切后果。使用其他商用歌声合成软件作为输入源时,请确保遵守该软件的使用条例,注意,许多歌声合成引擎使用条例中明确指明不可用于输入源进行转换! -4. 继续使用视为已同意本仓库 README 所述相关条例,本仓库 README 已进行劝导义务,不对后续可能存在问题负责。 -5. 如将本仓库代码二次分发,或将由此项目产出的任何结果公开发表 (包括但不限于视频网站投稿),请注明原作者及代码来源 (此仓库)。 +4. 禁止使用该项目从事违法行为与宗教、政治等活动,该项目维护者坚决抵制上述行为,不同意此条则禁止使用该项目。 +5. 继续使用视为已同意本仓库 README 所述相关条例,本仓库 README 已进行劝导义务,不对后续可能存在问题负责。 6. 如果将此项目用于任何其他企划,请提前联系并告知本仓库作者,十分感谢。 ## 🆕 Update! @@ -34,7 +38,7 @@ + 数据集制作、训练过程和3.0保持一致,但模型完全不通用,数据集也需要全部重新预处理 + 增加了可选项 1:vc模式自动预测音高f0,即转换语音时不需要手动输入变调key,男女声的调能自动转换,但仅限语音转换,该模式转换歌声会跑调 + 增加了可选项 2:通过kmeans聚类方案减小音色泄漏,即使得音色更加像目标音色 -+ 增加了可选项 3:增加了[NFS-HIFIGAN增强器](https://github.com/yxlllc/DDSP-SVC),对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭 ++ 增加了可选项 3:增加了[NSF-HIFIGAN增强器](https://github.com/yxlllc/DDSP-SVC),对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭 ## 💬 关于 Python 版本问题 diff --git a/app.py b/app.py deleted file mode 100644 index 0ff0c88..0000000 --- a/app.py +++ /dev/null @@ -1,69 +0,0 @@ -import io -import os - -# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt") -import gradio as gr -import librosa -import numpy as np -import soundfile -from inference.infer_tool import Svc -import logging - -logging.getLogger('numba').setLevel(logging.WARNING) -logging.getLogger('markdown_it').setLevel(logging.WARNING) -logging.getLogger('urllib3').setLevel(logging.WARNING) -logging.getLogger('matplotlib').setLevel(logging.WARNING) - -config_path = "configs/config.json" - -model = Svc("logs/44k/G_114400.pth", "configs/config.json", cluster_model_path="logs/44k/kmeans_10000.pt") - - - -def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale): - if input_audio is None: - return "You need to upload an audio", None - sampling_rate, audio = input_audio - # print(audio.shape,sampling_rate) - duration = audio.shape[0] / sampling_rate - if duration > 90: - return "请上传小于90s的音频,需要转换长音频请本地进行转换", None - audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) - if len(audio.shape) > 1: - audio = librosa.to_mono(audio.transpose(1, 0)) - if sampling_rate != 16000: - audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) - print(audio.shape) - out_wav_path = "temp.wav" - soundfile.write(out_wav_path, 
audio, 16000, format="wav") - print( cluster_ratio, auto_f0, noise_scale) - _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale) - return "Success", (44100, _audio) - - -app = gr.Blocks() -with app: - with gr.Tabs(): - with gr.TabItem("Basic"): - gr.Markdown(value=""" - sovits4.0 在线demo - - 此demo为预训练底模在线demo,使用数据:云灏 即霜 辉宇·星AI 派蒙 绫地宁宁 - """) - spks = list(model.spk2id.keys()) - sid = gr.Dropdown(label="音色", choices=spks, value=spks[0]) - vc_input3 = gr.Audio(label="上传音频(长度小于90秒)") - vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) - cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0) - auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)", value=False) - slice_db = gr.Number(label="切片阈值", value=-40) - noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4) - vc_submit = gr.Button("转换", variant="primary") - vc_output1 = gr.Textbox(label="Output Message") - vc_output2 = gr.Audio(label="Output Audio") - vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2]) - - app.launch() - - - diff --git a/inference/infer_tool.py b/inference/infer_tool.py index 5328c54..94b3ca6 100644 --- a/inference/infer_tool.py +++ b/inference/infer_tool.py @@ -6,6 +6,7 @@ import os import time from pathlib import Path from inference import slicer +import gc import librosa import numpy as np @@ -221,6 +222,16 @@ class Svc(object): # 清理显存 torch.cuda.empty_cache() + def unload_model(self): + # 卸载模型 + self.net_g_ms = self.net_g_ms.to("cpu") + del self.net_g_ms + if hasattr(self,"enhancer"): + self.enhancer.enhancer = self.enhancer.enhancer.to("cpu") + del self.enhancer.enhancer + del self.enhancer + gc.collect() + def slice_inference(self, raw_audio_path, spk, diff --git a/requirements.txt b/requirements.txt index a2b60c1..9dd41d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ Flask Flask_Cors -gradio -numpy +gradio>=3.7.0 +numpy==1.23.0 pyworld==0.2.5 -scipy==1.7.3 +scipy==1.10.0 SoundFile==0.12.1 torch==1.13.1 torchaudio==0.13.1 diff --git a/requirements_win.txt b/requirements_win.txt index 2c57f89..8201f6d 100644 --- a/requirements_win.txt +++ b/requirements_win.txt @@ -2,7 +2,7 @@ librosa==0.9.1 fairseq==0.12.2 Flask==2.1.2 Flask_Cors==3.0.10 -gradio +gradio>=3.7.0 numpy playsound==1.3.0 PyAudio==0.2.12 diff --git a/train.py b/train.py index 9f6e743..6cd3f97 100644 --- a/train.py +++ b/train.py @@ -105,7 +105,9 @@ def run(rank, n_gpus, hps): _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d, skip_optimizer) epoch_str = max(epoch_str, 1) - global_step = (epoch_str - 1) * len(train_loader) + name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth") + global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1 + #global_step = (epoch_str - 1) * len(train_loader) except: print("load old checkpoint failed...") epoch_str = 1 @@ -114,20 +116,30 @@ def run(rank, n_gpus, hps): epoch_str = 1 global_step = 0 + warmup_epoch = hps.train.warmup_epochs scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) scaler = GradScaler(enabled=hps.train.fp16_run) for epoch in range(epoch_str, hps.train.epochs + 1): + # update learning rate + if epoch 
> 1: + scheduler_g.step() + scheduler_d.step() + # set up warm-up learning rate + if epoch <= warmup_epoch: + for param_group in optim_g.param_groups: + param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch + for param_group in optim_d.param_groups: + param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch + # training if rank == 0: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) else: train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) - scheduler_g.step() - scheduler_d.step() def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): @@ -211,10 +223,14 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade if global_step % hps.train.log_interval == 0: lr = optim_g.param_groups[0]['lr'] losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] + reference_loss=0 + for i in losses: + reference_loss += math.log(i, 10) + reference_loss*=10 logger.info('Train Epoch: {} [{:.0f}%]'.format( epoch, 100. * batch_idx / len(train_loader))) - logger.info(f"Losses: {[x.item() for x in losses]}, step: {global_step}, lr: {lr}") + logger.info(f"Losses: {[x.item() for x in losses]}, step: {global_step}, lr: {lr}, reference_loss: {reference_loss}") scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} diff --git a/utils.py b/utils.py index 8df243a..fb60150 100644 --- a/utils.py +++ b/utils.py @@ -205,7 +205,7 @@ def f0_to_coarse(f0): f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) + f0_coarse = (f0_mel + 0.5).int() if is_torch else np.rint(f0_mel).astype(np.int) assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) return f0_coarse diff --git a/webUI.py b/webUI.py index c0467ba..ba2b361 100644 --- a/webUI.py +++ b/webUI.py @@ -9,7 +9,6 @@ import numpy as np import soundfile from inference.infer_tool import Svc import logging -import traceback import subprocess import edge_tts @@ -18,6 +17,7 @@ from scipy.io import wavfile import librosa import torch import time +import traceback logging.getLogger('numba').setLevel(logging.WARNING) logging.getLogger('markdown_it').setLevel(logging.WARNING) @@ -27,20 +27,53 @@ logging.getLogger('multipart').setLevel(logging.WARNING) model = None spk = None -debug=False +debug = False -cuda = [] +cuda = {} if torch.cuda.is_available(): for i in range(torch.cuda.device_count()): - cuda.append("cuda:{}".format(i)) + device_name = torch.cuda.get_device_properties(i).name + cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}" + +def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance): + global model + try: + device = cuda[device] if "CUDA" in device else device + model = Svc(model_path.name, config_path.name, device=device if device!="Auto" else None, cluster_model_path = cluster_model_path.name if cluster_model_path != None else "",nsf_hifigan_enhance=enhance) + spks = list(model.spk2id.keys()) + device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev) + msg = f"成功加载模型到设备{device_name}上\n" + if cluster_model_path is None: + msg += "未加载聚类模型\n" + else: + msg += 
f"聚类模型{cluster_model_path.name}加载成功\n" + msg += "当前模型的可用音色:\n" + for i in spks: + msg += i + " " + return sid.update(choices = spks,value=spks[0]), msg + except Exception as e: + if debug: traceback.print_exc() + raise gr.Error(e) + + +def modelUnload(): + global model + if model is None: + return sid.update(choices = [],value=""),"没有模型需要卸载!" + else: + model.unload_model() + model = None + torch.cuda.empty_cache() + return sid.update(choices = [],value=""),"模型卸载完毕!" + def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key): global model try: if input_audio is None: - return "You need to upload an audio", None + raise gr.Error("你需要上传音频") if model is None: - return "You need to upload an model", None + raise gr.Error("你需要指定模型") sampling_rate, audio = input_audio # print(audio.shape,sampling_rate) audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) @@ -54,16 +87,18 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise #构建保存文件的路径,并保存到results文件夹内 try: timestamp = str(int(time.time())) - output_file = os.path.join("./results", sid + "_" + timestamp + ".wav") + filename = sid + "_" + timestamp + ".wav" + output_file = os.path.join("./results", filename) soundfile.write(output_file, _audio, model.target_sample, format="wav") - return "Success", (model.target_sample, _audio) + return f"推理成功,音频文件保存为results/{filename}", (model.target_sample, _audio) except Exception as e: - if debug:traceback.print_exc() - return "自动保存失败,请手动保存,音乐输出见下", (model.target_sample, _audio) + if debug: traceback.print_exc() + raise gr.Error(e) except Exception as e: - if debug:traceback.print_exc() - return "异常信息:"+str(e)+"\n请排障后重试",None - + if debug: traceback.print_exc() + raise gr.Error(e) + + def tts_func(_text,_rate): #使用edge-tts把文字转成音频 # voice = "zh-CN-XiaoyiNeural"#女性,较高音 @@ -88,6 +123,7 @@ def tts_func(_text,_rate): p.wait() return output_file + def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,F0_mean_pooling,enhancer_adaptive_key): #使用edge-tts把文字转成音频 output_file=tts_func(text2tts,tts_rate) @@ -110,76 +146,82 @@ def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, nois os.remove(save_path2) return a,b -app = gr.Blocks() -with app: +def debug_change(): + global debug + debug = debug_button.value + +with gr.Blocks( + theme=gr.themes.Base( + primary_hue = gr.themes.colors.green, + font=["Source Sans Pro", "Arial", "sans-serif"], + font_mono=['JetBrains mono', "Consolas", 'Courier New'] + ), +) as app: with gr.Tabs(): - with gr.TabItem("Sovits4.0"): + with gr.TabItem("Inference"): gr.Markdown(value=""" - Sovits4.0 WebUI + So-vits-svc 4.0 推理 webui """) - - gr.Markdown(value=""" - 下面是模型文件选择: - """) - model_path = gr.File(label="模型文件") - gr.Markdown(value=""" - 下面是配置文件选择: - """) - config_path = gr.File(label="配置文件") - gr.Markdown(value=""" - 下面是聚类模型文件选择,没有可以不填: - """) - cluster_model_path = gr.File(label="聚类模型文件") - device = gr.Dropdown(label="推理设备,默认为自动选择cpu和gpu",choices=["Auto",*cuda,"cpu"],value="Auto") - enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False) - gr.Markdown(value=""" - 全部上传完毕后(全部文件模块显示download),点击模型解析进行解析: - """) - model_analysis_button = gr.Button(value="模型解析") - model_unload_button = gr.Button(value="模型卸载") - sid = gr.Dropdown(label="音色(说话人)") - sid_output = gr.Textbox(label="Output Message") + with 
gr.Row(variant="panel"): + with gr.Column(): + gr.Markdown(value=""" + 模型设置 + """) + model_path = gr.File(label="选择模型文件") + config_path = gr.File(label="选择配置文件") + cluster_model_path = gr.File(label="选择聚类模型文件(没有可以不选)") + device = gr.Dropdown(label="推理设备,默认为自动选择CPU和GPU", choices=["Auto",*cuda.keys(),"CPU"], value="Auto") + enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False) + with gr.Column(): + gr.Markdown(value=""" + 左侧文件全部选择完毕后(全部文件模块显示download),点击“加载模型”进行解析: + """) + model_load_button = gr.Button(value="加载模型", variant="primary") + model_unload_button = gr.Button(value="卸载模型", variant="primary") + sid = gr.Dropdown(label="音色(说话人)") + sid_output = gr.Textbox(label="Output Message") - text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪") - tts_rate = gr.Number(label="tts语速", value=0) - - vc_input3 = gr.Audio(label="上传音频") - vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) - cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0) - auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会究极跑调)", value=False) - F0_mean_pooling = gr.Checkbox(label="是否对F0使用均值滤波器(池化),对部分哑音有改善。注意,启动该选项会导致推理速度下降,默认关闭", value=False) - slice_db = gr.Number(label="切片阈值", value=-40) - noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4) - cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒/s", value=0) - pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5) - lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0) - lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75,interactive=True) - enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0,interactive=True) - vc_submit = gr.Button("音频直接转换", variant="primary") - vc_submit2 = gr.Button("文字转音频+转换", variant="primary") - vc_output1 = gr.Textbox(label="Output Message") - vc_output2 = gr.Audio(label="Output Audio") - def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance): - global model - try: - model = Svc(model_path.name, config_path.name,device=device if device!="Auto" else None,cluster_model_path= cluster_model_path.name if cluster_model_path!=None else "",nsf_hifigan_enhance=enhance) - spks = list(model.spk2id.keys()) - device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev) - return sid.update(choices = spks,value=spks[0]),"ok,模型被加载到了设备{}之上".format(device_name) - except Exception as e: - if debug:traceback.print_exc() - return "","异常信息:"+str(e)+"\n请排障后重试" - def modelUnload(): - global model - if model is None: - return sid.update(choices = [],value=""),"没有模型需要卸载!" - else: - model = None - torch.cuda.empty_cache() - return sid.update(choices = [],value=""),"模型卸载完毕!" 
+ + with gr.Row(variant="panel"): + with gr.Column(): + gr.Markdown(value=""" + 推理设置 + """) + auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声勾选此项会究极跑调)", value=False) + F0_mean_pooling = gr.Checkbox(label="是否对F0使用均值滤波器(池化),对部分哑音有改善。注意,启动该选项会导致推理速度下降,默认关闭", value=False) + vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) + cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0) + slice_db = gr.Number(label="切片阈值", value=-40) + noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4) + with gr.Column(): + pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5) + cl_num = gr.Number(label="音频自动切片,0为不切片,单位为秒(s)", value=0) + lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0) + lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75) + enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0) + with gr.Tabs(): + with gr.TabItem("音频转音频"): + vc_input3 = gr.Audio(label="选择音频") + vc_submit = gr.Button("音频转换", variant="primary") + with gr.TabItem("文字转音频"): + text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪") + tts_rate = gr.Number(label="tts语速", value=0) + vc_submit2 = gr.Button("文字转换", variant="primary") + with gr.Row(): + with gr.Column(): + vc_output1 = gr.Textbox(label="Output Message") + with gr.Column(): + vc_output2 = gr.Audio(label="Output Audio", interactive=False) + with gr.Row(variant="panel"): + with gr.Column(): + gr.Markdown(value=""" + WebUI设置 + """) + debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug) vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2]) vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2]) - model_analysis_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output]) + debug_button.change(debug_change,[],[]) + model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output]) model_unload_button.click(modelUnload,[],[sid,sid_output]) app.launch()
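
The webUI.py rework above reduces to a load → infer → unload cycle around `Svc`. Below is a minimal sketch of that cycle (not part of the patch): the checkpoint, cluster-model, and audio paths and the speaker name are placeholders, and the `slice_inference()` arguments follow the positional call used by the removed app.py.

```python
import os

import soundfile
import torch

from inference.infer_tool import Svc

# Load: mirrors modelAnalysis() in webUI.py. device=None is what the "Auto" dropdown
# passes, letting Svc pick CPU or CUDA; paths below are placeholders.
model = Svc(
    "logs/44k/G_114400.pth",                        # generator checkpoint (placeholder)
    "configs/config.json",                          # matching config
    device=None,
    cluster_model_path="logs/44k/kmeans_10000.pt",  # "" if no k-means cluster model
    nsf_hifigan_enhance=False,                      # NSF-HIFIGAN enhancer, off by default
)
print("speakers:", list(model.spk2id.keys()))

# Infer: raw audio path, speaker, transpose (semitones), slice threshold (dB),
# cluster mix ratio, auto f0 prediction, noise_scale — same order as the old app.py call.
_audio = model.slice_inference("raw/example.wav", "speaker0", 0, -40, 0.5, False, 0.4)
os.makedirs("results", exist_ok=True)
soundfile.write("results/example_out.wav", _audio, model.target_sample, format="wav")

# Unload: mirrors modelUnload() in webUI.py. The new Svc.unload_model() moves the nets
# to CPU, deletes them, and runs gc.collect(); empty_cache() then frees GPU memory so a
# different model can be loaded without restarting the process.
model.unload_model()
model = None
torch.cuda.empty_cache()
```

Keeping the unload path inside `Svc` (rather than only dropping the Python reference, as the old `modelUnload` did) is what actually releases the CUDA allocations when swapping models in the WebUI.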
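The train.py change adds a linear warm-up on top of the existing exponential decay: for the first `hps.train.warmup_epochs` epochs the learning rate is overridden to `learning_rate / warmup_epochs * epoch`, and only afterwards do the `ExponentialLR` schedulers (now stepped once per epoch at the top of the loop) take over. A small worked example with assumed values:

```python
# Assumed example values; the real ones come from hps.train in configs/config.json.
learning_rate = 1e-4
warmup_epochs = 2

# Warm-up, exactly as written in run():
#   param_group['lr'] = hps.train.learning_rate / warmup_epoch * epoch
for epoch in (1, 2):
    print(epoch, learning_rate / warmup_epochs * epoch)  # epoch 1 -> 5e-05, epoch 2 -> 0.0001

# From epoch 3 onward, scheduler_g.step() / scheduler_d.step() multiply the current
# learning rate by hps.train.lr_decay once per epoch, as before the change.
```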