Merge branch '4.1-Latest' into 4.1-Latest

commit 6c18b0eb58
YuriHead, 2023-07-05 23:03:08 +08:00 (committed by GitHub)
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
12 changed files with 35 additions and 24 deletions

.github/workflows/ruff.yml (new file, +8 lines)

@@ -0,0 +1,8 @@
+name: Ruff
+on: [push, pull_request]
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: chartboost/ruff-action@v1
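This workflow lints every push and pull request with Ruff via the chartboost/ruff-action action. A minimal sketch of reproducing the same check locally, assuming Ruff has been pip-installed (the `run_lint.py` file name is hypothetical, not part of this commit):

```python
# run_lint.py -- hypothetical local equivalent of the CI job above.
# Assumes `pip install ruff`; Ruff discovers .ruff.toml at the repo root.
import subprocess
import sys

result = subprocess.run(["ruff", "check", "."])
sys.exit(result.returncode)  # non-zero exit fails, just like the CI step
```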

.gitignore (9 lines changed)

@@ -150,7 +150,6 @@ results
inference/chunks_temp.json
logs
hubert/checkpoint_best_legacy_500.pt
-pretrain/**/*.pt
configs/config.json
filelists/test.txt
filelists/train.txt
@@ -162,11 +161,5 @@ filelists/val.txt
.idea/vcs.xml
.idea/inspectionProfiles/profiles_settings.xml
.idea/inspectionProfiles/Project_Default.xml
-pretrain/vec-768-layer-12.onnx
-pretrain/hubert-soft.onnx
-pretrain/hubert4.0.onnx
-pretrain/vec-256-layer-9.onnx
-pretrain/vec-256-layer-12.onnx
-pretrain/vec-768-layer-9.onnx
+pretrain/
.vscode/launch.json
.ruff.toml

.ruff.toml (new file, +4 lines)

@@ -0,0 +1,4 @@
+select = ["E", "F", "I"]
+
+# Never enforce `E501` (line length violations).
+ignore = ["E501", "E741"]
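Here `select = ["E", "F", "I"]` turns on the pycodestyle errors (E), Pyflakes (F), and import-sorting (I) rule sets, while E501 (line too long) and E741 (ambiguous variable name) are excluded. A hedged illustration of what this configuration does and does not flag:

```python
# Hypothetical snippet checked against the config above:
import sys, os   # E401 (multiple imports on one line) -- reported via "E"
import json      # F401 (imported but unused) -- reported via "F"

l = 0  # E741 (ambiguous variable name) is in "E" but explicitly ignored

print(sys.platform, os.sep, l)
```

Out-of-order imports would additionally be reported as I001 by the "I" set, which is why several files in this merge have their imports re-sorted.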


@@ -27,7 +27,7 @@ This project serves as a framework only and does not possess speech synthesis fu
# Warning: Please ensure that you address any authorization issues related to the dataset on your own. You bear full responsibility for any problems arising from the usage of non-authorized datasets for training, as well as any resulting consequences. The repository and its maintainer, svc develop team, disclaim any association with or liability for the consequences.
1. This project is exclusively established for academic purposes, aiming to facilitate communication and learning. It is not intended for deployment in production environments.
-2. Any sovits-based video posted to a video platform must clearly specify in the introduction the input source vocals and audio used for the voice changer conversion, e.g., if you use someone else's video/audio and convert it by separating the vocals as the input source, you must give a clear link to the original video or music; if you use your own vocals or a voice synthesized by another voice synthesis engine as the input source, you must also specify this in the introduction.
+2. Any sovits-based video posted to a video platform must clearly specify in the introduction the input source vocals and audio used for the voice changer conversion, e.g., if you use someone else's video/audio and convert it by separating the vocals as the input source, you must give a clear link to the original video or music; if you use your own vocals or a voice synthesized by another voice synthesis engine as the input source, you must also state this in your introduction.
3. You are solely responsible for any infringement issues caused by the input source and all consequences. When using other commercial vocal synthesis software as an input source, please ensure that you comply with the regulations of that software, noting that the regulations of many vocal synthesis engines explicitly state that they cannot be used to convert input sources!
4. Engaging in illegal activities, as well as religious and political activities, is strictly prohibited when using this project. The project developers vehemently oppose the aforementioned activities. If you disagree with this provision, the usage of the project is prohibited.
5. If you continue to use the program, you will be deemed to have agreed to the terms and conditions set forth in README and README has discouraged you and is not responsible for any subsequent problems.
@@ -206,7 +206,7 @@ python resample.py
#### Cautions
-Although this project has resample.py scripts for resampling, mono and loudness matching, the default loudness matching is to match to 0db. This can cause damage to the sound quality. While python's loudness matching package pyloudnorm does not limit the level, this can lead to popping. Therefore, it is recommended to consider using professional sound processing software, such as `adobe audition` for loudness matching. If you are already using other software for loudness matching, add the parameter `-skip_loudnorm` to the run command:
+Although this project has resample.py scripts for resampling, mono and loudness matching, the default loudness matching is to match to 0db. This can cause damage to the sound quality. While python's loudness matching package pyloudnorm does not limit the level, this can lead to sonic boom. Therefore, it is recommended to consider using professional sound processing software, such as `adobe audition` for loudness matching. If you are already using other software for loudness matching, add the parameter `-skip_loudnorm` to the run command:
```shell
python resample.py --skip_loudnorm
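Since pyloudnorm applies pure gain with no limiter, matching to 0db can push samples past full scale and clip (the "popping" the README warns about). A minimal sketch of loudness matching with headroom plus a peak check, assuming `pyloudnorm` and `soundfile` are installed; the helper name and the -16 LUFS target are illustrative, not part of resample.py:

```python
import numpy as np
import pyloudnorm as pyln
import soundfile as sf

def match_loudness(path_in, path_out, target_lufs=-16.0):
    data, rate = sf.read(path_in)
    meter = pyln.Meter(rate)                    # ITU-R BS.1770 loudness meter
    loudness = meter.integrated_loudness(data)
    out = pyln.normalize.loudness(data, loudness, target_lufs)
    peak = np.max(np.abs(out))
    if peak > 1.0:                              # pyloudnorm will not limit the level
        out = out / peak                        # fall back to peak scaling to avoid clipping
    sf.write(path_out, out, rate)
```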
@@ -299,7 +299,7 @@ python preprocess_hubert_f0.py --f0_predictor dio --use_diff
After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
-## 🏋️‍♀️ Training
+## 🏋️‍ Training
### Sovits Model


@@ -301,7 +301,7 @@ python preprocess_hubert_f0.py --f0_predictor dio --use_diff
After completing the above steps, the dataset directory contains the preprocessed data, and the dataset_raw folder can be deleted
-## 🏋️‍♀️ Training
+## 🏋️‍ Training
### Main Model Training


@@ -1,6 +1,6 @@
import torch
import torch.nn as nn
-from torch.nn.utils import weight_norm, remove_weight_norm
+from torch.nn.utils import remove_weight_norm, weight_norm

class Depthwise_Separable_Conv1D(nn.Module):
    def __init__(
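`Depthwise_Separable_Conv1D` follows the standard depthwise separable pattern: a per-channel convolution followed by a pointwise 1x1 mix. A minimal sketch of that pattern; the class and argument names here are illustrative, not the module's actual signature:

```python
import torch
import torch.nn as nn

class DepthwiseSeparableConv1dSketch(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, padding=0):
        super().__init__()
        # groups=in_ch: each input channel is filtered independently
        self.depthwise = nn.Conv1d(in_ch, in_ch, kernel_size,
                                   padding=padding, groups=in_ch)
        # 1x1 conv mixes channels, completing the "separable" factorization
        self.pointwise = nn.Conv1d(in_ch, out_ch, 1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))

x = torch.randn(1, 16, 100)                  # (batch, channels, time)
y = DepthwiseSeparableConv1dSketch(16, 32, 5, padding=2)(x)
print(y.shape)                               # torch.Size([1, 32, 100])
```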


@@ -53,7 +53,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
    y = y.squeeze(1)
    y_dtype = y.dtype
-    if y.dtype == torch.bfloat16: y = y.to(torch.float32)
+    if y.dtype == torch.bfloat16:
+        y = y.to(torch.float32)
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec).to(y_dtype)
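The change splits a one-line guard into two lines to satisfy the newly selected "E" rules (compound statements on one line), with identical behavior: `torch.stft` does not accept bfloat16 input in the PyTorch versions this project targets, so the signal is computed in float32 and the spectrogram is cast back afterwards. A condensed, hypothetical standalone version of that cast-and-restore pattern:

```python
import torch

def stft_bf16_safe(y, n_fft=1024, hop=256):
    # Sketch of the guard above: compute the STFT in float32,
    # then restore the caller's dtype on the way out.
    y_dtype = y.dtype
    if y.dtype == torch.bfloat16:
        y = y.to(torch.float32)
    window = torch.hann_window(n_fft, device=y.device, dtype=y.dtype)
    spec = torch.stft(y, n_fft, hop_length=hop, win_length=n_fft, window=window,
                      center=True, pad_mode='reflect', return_complex=True)
    return torch.view_as_real(spec).to(y_dtype)

print(stft_bf16_safe(torch.randn(1, 8192, dtype=torch.bfloat16)).dtype)  # torch.bfloat16
```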


@@ -1,12 +1,14 @@
import torch
from torch import nn
from torch.nn import Conv1d
from torch.nn import functional as F
-from modules.DSConv import weight_norm_modules, remove_weight_norm_modules, Depthwise_Separable_Conv1D
import modules.commons as commons
-from modules.commons import init_weights, get_padding
+from modules.commons import get_padding, init_weights
+from modules.DSConv import (
+    Depthwise_Separable_Conv1D,
+    remove_weight_norm_modules,
+    weight_norm_modules,
+)

LRELU_SLOPE = 0.1


@@ -99,7 +99,7 @@ def run(rank, n_gpus, hps):
        name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
        global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1
        #global_step = (epoch_str - 1) * len(train_loader)
-    except:
+    except: # noqa: E722 I have no idea about this CC: @ylzz1997
        print("load old checkpoint failed...")
        epoch_str = 1
        global_step = 0
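Ruff's E722 forbids bare `except:`; the merge keeps the bare handler and silences the rule with a `noqa` comment instead of narrowing it. For comparison, a hedged sketch of the narrower alternative applied to the same filename-parsing fallback (the helper name is hypothetical):

```python
import re

def resume_global_step(ckpt_name):
    # e.g. "D_12000.pth" -> 12001; restart at 0 if the name doesn't parse.
    try:
        return int(re.search(r"_(\d+)\.pth$", ckpt_name).group(1)) + 1
    except (AttributeError, ValueError):  # no match / non-numeric suffix
        return 0

print(resume_global_step("D_12000.pth"))  # 12001
print(resume_global_step("broken.pth"))   # 0
```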


@@ -161,7 +161,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False)
            # print("load", k)
            new_state_dict[k] = saved_state_dict[k]
            assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape)
-        except:
+        except: # noqa: E722 I have no idea about this CC: @ylzz1997
            print("error, %s is not in the checkpoint" % k)
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
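The loop above implements shape-tolerant checkpoint loading: parameters whose shapes match are copied from the checkpoint, and anything missing or mismatched keeps its freshly initialized value. A condensed sketch of the same idea without the bare `except:` (the function name is hypothetical):

```python
import torch

def load_compatible(model, saved_state_dict):
    new_state_dict = {}
    for k, v in model.state_dict().items():
        if k in saved_state_dict and saved_state_dict[k].shape == v.shape:
            new_state_dict[k] = saved_state_dict[k]  # shape matches: take checkpoint weight
        else:
            print("error, %s is not in the checkpoint" % k)
            new_state_dict[k] = v                    # keep the current (initialized) weight
    model.load_state_dict(new_state_dict)
```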


@@ -1,6 +1,6 @@
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.

-from .act import *
-from .filter import *
-from .resample import *
+from .act import * # noqa: F403
+from .filter import * # noqa: F403
+from .resample import * # noqa: F403
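F403 flags `from module import *` because it hides which names a package re-exports; since this file is vendored from alias-free-torch, the merge suppresses the rule rather than rewriting the imports. For comparison, a hedged sketch of the explicit style that keeps F403 quiet without `noqa`; the listed names are assumptions based on the upstream alias-free-torch modules, not verified against this repo:

```python
# Hypothetical explicit re-export -- no star imports, so F403 never fires.
from .act import Activation1d
from .filter import LowPassFilter1d
from .resample import DownSample1d, UpSample1d

__all__ = ["Activation1d", "LowPassFilter1d", "DownSample1d", "UpSample1d"]
```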


@@ -377,7 +377,9 @@ with gr.Blocks(
""")
debug_button = gr.Checkbox(label="Debug mode: enable it if you need to report a bug to the community; once enabled, the console shows detailed error messages", value=debug)
vc_submit.click(vc_fn, [sid, vc_input3, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
vc_submit2.click(vc_fn2, [text2tts, tts_lang, tts_gender, tts_rate, tts_volume, sid, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
debug_button.change(debug_change,[],[])
model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix],[sid,sid_output])
model_unload_button.click(modelUnload,[],[sid,sid_output])
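These lines wire the web UI events: `component.click(fn, inputs, outputs)` calls `fn` with the current values of the input components and routes its return values to the output components, and `.change` does the same whenever a control's value changes. A minimal, self-contained sketch of the same Gradio pattern (all names hypothetical):

```python
import gradio as gr

def convert(text, gain):
    return "processed %r at %+d dB" % (text, gain)

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Input")
    gain = gr.Slider(-12, 12, value=0, step=1, label="Gain")
    run_btn = gr.Button("Run")
    result = gr.Textbox(label="Output")
    # Same wiring pattern as vc_submit.click above: fn, inputs list, outputs list.
    run_btn.click(convert, [text_in, gain], [result])

if __name__ == "__main__":
    demo.launch()
```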