From 94a185e9575136520e79ab06e5ee4afd8f9c1677 Mon Sep 17 00:00:00 2001 From: asdfw13 <86564126+asdfw13@users.noreply.github.com> Date: Sat, 1 Jul 2023 03:18:32 +0800 Subject: [PATCH 1/7] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3247eb9..a1c9f37 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ This project serves as a framework only and does not possess speech synthesis fu # Warning: Please ensure that you address any authorization issues related to the dataset on your own. You bear full responsibility for any problems arising from the usage of non-authorized datasets for training, as well as any resulting consequences. The repository and its maintainer, svc develop team, disclaim any association with or liability for the consequences. 1. This project is exclusively established for academic purposes, aiming to facilitate communication and learning. It is not intended for deployment in production environments. -2. Any sovits-based video posted to a video platform must clearly specify in the introduction the input source vocals and audio used for the voice changer conversion, e.g., if you use someone else's video/audio and convert it by separating the vocals as the input source, you must give a clear link to the original video or music; if you use your own vocals or a voice synthesized by another voice synthesis engine as the input source, you must also specify this in the introduction. +2. Any sovits-based video posted to a video platform must clearly specify in the introduction the input source vocals and audio used for the voice changer conversion, e.g., if you use someone else's video/audio and convert it by separating the vocals as the input source, you must give a clear link to the original video or music; if you use your own vocals or a voice synthesized by another voice synthesis engine as the input source, you must also state this in your introduction. 3. 
You are solely responsible for any infringement issues caused by the input source and all consequences. When using other commercial vocal synthesis software as an input source, please ensure that you comply with the regulations of that software, noting that the regulations of many vocal synthesis engines explicitly state that they cannot be used to convert input sources! 4. Engaging in illegal activities, as well as religious and political activities, is strictly prohibited when using this project. The project developers vehemently oppose the aforementioned activities. If you disagree with this provision, the usage of the project is prohibited. 5. If you continue to use the program, you will be deemed to have agreed to the terms and conditions set forth in README and README has discouraged you and is not responsible for any subsequent problems. @@ -206,7 +206,7 @@ python resample.py #### Cautions -Although this project has resample.py scripts for resampling, mono and loudness matching, the default loudness matching is to match to 0db. This can cause damage to the sound quality. While python's loudness matching package pyloudnorm does not limit the level, this can lead to popping. Therefore, it is recommended to consider using professional sound processing software, such as `adobe audition` for loudness matching. If you are already using other software for loudness matching, add the parameter `-skip_loudnorm` to the run command: +Although this project has resample.py scripts for resampling, mono and loudness matching, the default loudness matching is to match to 0db. This can cause damage to the sound quality. While python's loudness matching package pyloudnorm does not limit the level, this can lead to clipping. Therefore, it is recommended to consider using professional sound processing software, such as `adobe audition` for loudness matching. 
If you are already using other software for loudness matching, add the parameter `-skip_loudnorm` to the run command: ```shell python resample.py --skip_loudnorm From 0bf22ab8da4fd1d02d2ce926e3457ffbb5ca0ff2 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Sun, 2 Jul 2023 03:27:38 +0800 Subject: [PATCH 2/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1c9f37..ae67f6d 100644 --- a/README.md +++ b/README.md @@ -299,7 +299,7 @@ python preprocess_hubert_f0.py --f0_predictor dio --use_diff After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted. -## 🏋️‍♀️ Training +## 🏋️‍ Training ### Sovits Model From 71847b11db8150fd4f82239d93934e9039cd57a7 Mon Sep 17 00:00:00 2001 From: YuriHead Date: Sun, 2 Jul 2023 03:28:12 +0800 Subject: [PATCH 3/7] Update README_zh_CN.md --- README_zh_CN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh_CN.md b/README_zh_CN.md index c1d5d9a..b3af5f0 100644 --- a/README_zh_CN.md +++ b/README_zh_CN.md @@ -301,7 +301,7 @@ python preprocess_hubert_f0.py --f0_predictor dio --use_diff 执行完以上步骤后 dataset 目录便是预处理完成的数据,可以删除 dataset_raw 文件夹了 -## 🏋️‍♀️ 训练 +## 🏋️‍ 训练 ### 主模型训练 From 3f29e43bed94ed9ab82f4e7ef68c2a4aa0b5f480 Mon Sep 17 00:00:00 2001 From: umoubuton <127150330+umoufuton@users.noreply.github.com> Date: Tue, 4 Jul 2023 18:53:11 +0800 Subject: [PATCH 4/7] Update webUI.py 1. Standardize the output file names to the format in `inference_main.py` 2. 
Avoid assigning the cluster_ratio before uploading the cluster model or feature retrieval model --- webUI.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/webUI.py b/webUI.py index e3a8cad..4f3ee9f 100644 --- a/webUI.py +++ b/webUI.py @@ -15,6 +15,7 @@ import numpy as np import soundfile import torch from scipy.io import wavfile +from pathlib import Path from compress_model import removeOptimizer from inference.infer_tool import Svc @@ -81,7 +82,6 @@ def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_ device = cuda[device] if "CUDA" in device else device cluster_filepath = os.path.split(cluster_model_path.name) if cluster_model_path is not None else "no_cluster" fr = ".pkl" in cluster_filepath[1] - #model = Svc(model_path.name, config_path.name, device=device if device!="Auto" else None, cluster_model_path = cluster_model_path.name if cluster_model_path != None else "",nsf_hifigan_enhance=enhance) model = Svc(model_path.name, config_path.name, device=device if device != "Auto" else None, @@ -127,24 +127,30 @@ def modelUnload(): torch.cuda.empty_cache() return sid.update(choices = [],value=""),"模型卸载完毕!" 
-def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment): +def vc_fn(sid, input_audio, output_format, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment): global model try: if input_audio is None: return "You need to upload an audio", None if model is None: return "You need to upload an model", None - print(input_audio) - sampling_rate, audio = input_audio - print(audio.shape,sampling_rate) - audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) - print(audio.dtype) + if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False: + if cluster_ratio != 0: + return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None + #print(input_audio) + audio, sampling_rate = soundfile.read(input_audio) + #print(audio.shape,sampling_rate) + if np.issubdtype(audio.dtype, np.integer): + audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) + #print(audio.dtype) if len(audio.shape) > 1: audio = librosa.to_mono(audio.transpose(1, 0)) - temp_path = "temp.wav" - soundfile.write(temp_path, audio, sampling_rate, format="wav") + # 未知原因Gradio上传的filepath会有一个奇怪的固定后缀,这里去掉 + truncated_basename = Path(input_audio).stem[:-6] + processed_audio = os.path.join("raw", f"{truncated_basename}.wav") + soundfile.write(processed_audio, audio, sampling_rate, format="wav") _audio = model.slice_inference( - temp_path, + processed_audio, sid, vc_transform, slice_db, @@ -164,13 +170,19 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise loudness_envelope_adjustment ) model.clear_empty() - os.remove(temp_path) + #os.remove(temp_path) #构建保存文件的路径,并保存到results文件夹内 
timestamp = str(int(time.time())) if not os.path.exists("results"): os.makedirs("results") - output_file = os.path.join("results", sid + "_" + timestamp + ".wav") - soundfile.write(output_file, _audio, model.target_sample, format="wav") + key = "auto" if auto_f0 else f"{int(vc_transform)}key" + cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_" + isdiffusion = "sovits" + if model.shallow_diffusion : isdiffusion = "sovdiff" + if model.only_diffusion : isdiffusion = "diff" + output_file_name = 'result_'+truncated_basename+f'_{sid}_{key}{cluster}{isdiffusion}.{output_format}' + output_file = os.path.join("results", output_file_name) + soundfile.write(output_file, _audio, model.target_sample, format=output_format) return "Success", output_file except Exception as e: if debug: @@ -291,6 +303,7 @@ with gr.Blocks( vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,0即不启用聚类/特征检索。使用聚类/特征检索能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0) slice_db = gr.Number(label="切片阈值", value=-40) + output_format = gr.Radio(label="音频输出格式", choices=["wav", "flac", "mp3"], value = "wav") noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4) k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000) with gr.Column(): @@ -305,7 +318,7 @@ with gr.Blocks( use_spk_mix = gr.Checkbox(label = "动态声线融合", value = False, interactive = False) with gr.Tabs(): with gr.TabItem("音频转音频"): - vc_input3 = gr.Audio(label="选择音频") + vc_input3 = gr.Audio(label="选择音频", type="filepath") vc_submit = gr.Button("音频转换", variant="primary") with gr.TabItem("文字转音频"): text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪") @@ -371,7 +384,7 @@ with gr.Blocks( WebUI设置 """) debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug) - vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, 
noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2]) + vc_submit.click(vc_fn, [sid, vc_input3, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2]) vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,f0_predictor,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2]) debug_button.change(debug_change,[],[]) model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix],[sid,sid_output]) From bf543d24e84500307e5bd29db3d5c147afa005b4 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Wed, 5 Jul 2023 14:33:40 +0800 Subject: [PATCH 5/7] feat: add `Ruff` as github action --- .github/workflows/ruff.yml | 8 ++++++++ .gitignore | 9 +-------- .ruff.toml | 4 ++++ 3 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/ruff.yml create mode 100644 .ruff.toml diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 0000000..563b87d --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,8 @@ +name: Ruff +on: [push, pull_request] +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: chartboost/ruff-action@v1 diff --git a/.gitignore b/.gitignore index a26a9e6..999aa5d 100644 --- a/.gitignore +++ b/.gitignore @@ -150,7 +150,6 @@ results inference/chunks_temp.json logs hubert/checkpoint_best_legacy_500.pt -pretrain/**/*.pt configs/config.json filelists/test.txt filelists/train.txt @@ -162,11 +161,5 @@ filelists/val.txt .idea/vcs.xml 
.idea/inspectionProfiles/profiles_settings.xml .idea/inspectionProfiles/Project_Default.xml -pretrain/vec-768-layer-12.onnx -pretrain/hubert-soft.onnx -pretrain/hubert4.0.onnx -pretrain/vec-256-layer-9.onnx -pretrain/vec-256-layer-12.onnx -pretrain/vec-768-layer-9.onnx +pretrain/ .vscode/launch.json -.ruff.toml diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 0000000..ba961ac --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,4 @@ +select = ["E", "F", "I"] + +# Never enforce `E501` (line length violations). +ignore = ["E501"] From f557072b4982118995a971dc6144b577d4386ece Mon Sep 17 00:00:00 2001 From: magic-akari Date: Wed, 5 Jul 2023 14:52:16 +0800 Subject: [PATCH 6/7] chore: make Ruff happy --- .ruff.toml | 2 +- modules/DSConv.py | 4 ++-- modules/mel_processing.py | 4 +++- modules/modules.py | 10 ++++++---- webUI.py | 12 ++++++++---- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.ruff.toml b/.ruff.toml index ba961ac..fb05db4 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -1,4 +1,4 @@ select = ["E", "F", "I"] # Never enforce `E501` (line length violations). 
-ignore = ["E501"] +ignore = ["E501", "E741"] diff --git a/modules/DSConv.py b/modules/DSConv.py index 9909521..44c2bf6 100644 --- a/modules/DSConv.py +++ b/modules/DSConv.py @@ -1,6 +1,6 @@ -import torch import torch.nn as nn -from torch.nn.utils import weight_norm, remove_weight_norm +from torch.nn.utils import remove_weight_norm, weight_norm + class Depthwise_Separable_Conv1D(nn.Module): def __init__( diff --git a/modules/mel_processing.py b/modules/mel_processing.py index 8ac0717..c21e4bf 100644 --- a/modules/mel_processing.py +++ b/modules/mel_processing.py @@ -53,7 +53,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) y = y.squeeze(1) y_dtype = y.dtype - if y.dtype == torch.bfloat16: y = y.to(torch.float32) + if y.dtype == torch.bfloat16: + y = y.to(torch.float32) + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) spec = torch.view_as_real(spec).to(y_dtype) diff --git a/modules/modules.py b/modules/modules.py index ba67df6..214bca4 100644 --- a/modules/modules.py +++ b/modules/modules.py @@ -1,12 +1,14 @@ import torch from torch import nn -from torch.nn import Conv1d from torch.nn import functional as F -from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D - import modules.commons as commons -from modules.commons import init_weights, get_padding +from modules.commons import get_padding, init_weights +from modules.DSConv import ( + Depthwise_Separable_Conv1D, + remove_weight_norm_modules, + weight_norm_modules, +) LRELU_SLOPE = 0.1 diff --git a/webUI.py b/webUI.py index 4f3ee9f..6cd429a 100644 --- a/webUI.py +++ b/webUI.py @@ -6,6 +6,7 @@ import subprocess import time import traceback from itertools import chain +from pathlib import Path # os.system("wget -P cvec/ 
https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt") import gradio as gr @@ -15,7 +16,6 @@ import numpy as np import soundfile import torch from scipy.io import wavfile -from pathlib import Path from compress_model import removeOptimizer from inference.infer_tool import Svc @@ -172,14 +172,18 @@ def vc_fn(sid, input_audio, output_format, vc_transform, auto_f0,cluster_ratio, model.clear_empty() #os.remove(temp_path) #构建保存文件的路径,并保存到results文件夹内 - timestamp = str(int(time.time())) + str(int(time.time())) if not os.path.exists("results"): os.makedirs("results") key = "auto" if auto_f0 else f"{int(vc_transform)}key" cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_" isdiffusion = "sovits" - if model.shallow_diffusion : isdiffusion = "sovdiff" - if model.only_diffusion : isdiffusion = "diff" + if model.shallow_diffusion: + isdiffusion = "sovdiff" + + if model.only_diffusion: + isdiffusion = "diff" + output_file_name = 'result_'+truncated_basename+f'_{sid}_{key}{cluster}{isdiffusion}.{output_format}' output_file = os.path.join("results", output_file_name) soundfile.write(output_file, _audio, model.target_sample, format=output_format) From 647e9faa4658c823f55751edffbeb7d635da9542 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Wed, 5 Jul 2023 14:59:17 +0800 Subject: [PATCH 7/7] chore: ignore some lint issues --- train.py | 2 +- utils.py | 2 +- vdecoder/hifiganwithsnake/alias/__init__.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index dbdec11..248dd8b 100644 --- a/train.py +++ b/train.py @@ -99,7 +99,7 @@ def run(rank, n_gpus, hps): name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth") global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1 #global_step = (epoch_str - 1) * len(train_loader) - except: + except: # noqa: E722 I have no idea about this CC: @ylzz1997 print("load old checkpoint failed...") epoch_str = 1 global_step = 0 diff --git a/utils.py b/utils.py index 
c3336b1..b9fe21a 100644 --- a/utils.py +++ b/utils.py @@ -161,7 +161,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False # print("load", k) new_state_dict[k] = saved_state_dict[k] assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) - except: + except: # noqa: E722 I have no idea about this CC: @ylzz1997 print("error, %s is not in the checkpoint" % k) logger.info("%s is not in the checkpoint" % k) new_state_dict[k] = v diff --git a/vdecoder/hifiganwithsnake/alias/__init__.py b/vdecoder/hifiganwithsnake/alias/__init__.py index 117e5ac..be97a33 100644 --- a/vdecoder/hifiganwithsnake/alias/__init__.py +++ b/vdecoder/hifiganwithsnake/alias/__init__.py @@ -1,6 +1,6 @@ # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 # LICENSE is in incl_licenses directory. -from .act import * -from .filter import * -from .resample import * +from .act import * # noqa: F403 +from .filter import * # noqa: F403 +from .resample import * # noqa: F403