Merge branch '4.1-Stable' into 4.1-Latest

This commit is contained in:
magic-akari 2023-07-23 23:05:02 +08:00
commit 317cde248d
No known key found for this signature in database
GPG Key ID: EC005B1159285BDD
10 changed files with 51 additions and 40 deletions

View File

@ -1,4 +1,5 @@
<div align="center">
<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
# SoftVC VITS Singing Voice Conversion
@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```
**Speed up preprocessing**
If your dataset is quite large, you can increase the `--num_processes` parameter like this:
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be assigned to different GPUs if you have more than one GPU.
After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
#### You can modify some parameters in the generated config.json and diffusion.yaml
@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```
**Speed up preprocessing**
If your dataset is quite large, you can increase the `--num_processes` parameter like this:
```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be assigned to different GPUs if you have more than one GPU.
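As a sketch of what that assignment looks like, each worker process derives its device from its multiprocessing rank; this mirrors the `process_batch` logic in `preprocess_hubert_f0.py`, shown further down in this commit:
```python
# Minimal sketch of the rank-based device selection used by the
# preprocessing workers (mirrors process_batch in preprocess_hubert_f0.py).
import torch
import torch.multiprocessing as mp

def pick_device():
    # Worker processes carry an identity tuple; its first entry is the rank.
    rank = mp.current_process()._identity
    rank = rank[0] if len(rank) > 0 else 0
    if torch.cuda.is_available():
        # Round-robin the workers across all visible GPUs.
        return torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    return torch.device("cpu")
```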
After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
## 🏋️‍ Training

View File

@ -1,5 +1,6 @@
<div align="center">
<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
# SoftVC VITS Singing Voice Conversion
[**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@ -268,13 +269,6 @@ wavlmbase+
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```
**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be automatically assigned across multiple GPUs if you have more than one.
With loudness embedding enabled, the trained model will match the loudness of the input source; otherwise it will match the loudness of the training set.
#### At this point you can modify some parameters in the generated config.json and diffusion.yaml
@ -335,6 +329,13 @@ fcpe
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```
**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be automatically distributed across the available GPUs.
After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
## 🏋️‍ Training

View File

@ -28,7 +28,7 @@ class RMVPE:
def mel2hidden(self, mel):
with torch.no_grad():
n_frames = mel.shape[-1]
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
hidden = self.model(mel)
return hidden[:, :n_frames]
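The switch from `reflect` to `constant` padding matters for short inputs: reflection padding requires the pad amount to be strictly smaller than the padded dimension, so rounding a short mel up to a multiple of 32 frames can raise an error, while zero padding always succeeds. A quick illustration (hypothetical shapes):
```python
import torch
import torch.nn.functional as F

mel = torch.randn(1, 128, 10)  # a clip with only 10 mel frames
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # round up to 32 -> pad 22

padded = F.pad(mel, (0, pad), mode='constant')    # fine: zero-pads to 32 frames
# F.pad(mel, (0, pad), mode='reflect')            # RuntimeError: pad >= input dim
```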

View File

@ -5,6 +5,7 @@ import re
import wave
from random import shuffle
from loguru import logger
from tqdm import tqdm
import diffusion.logger.utils as du
@ -47,9 +48,9 @@ if __name__ == "__main__":
if not file.endswith("wav"):
continue
if not pattern.match(file):
print(f"warning文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
logger.warning(f"文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
if get_wav_duration(file) < 0.3:
print("skip too short audio:", file)
logger.info("Skip too short audio:" + file)
continue
new_wavs.append(file)
wavs = new_wavs
@ -60,13 +61,13 @@ if __name__ == "__main__":
shuffle(train)
shuffle(val)
print("Writing", args.train_list)
logger.info("Writing" + args.train_list)
with open(args.train_list, "w") as f:
for fname in tqdm(train):
wavpath = fname
f.write(wavpath + "\n")
print("Writing", args.val_list)
logger.info("Writing" + args.val_list)
with open(args.val_list, "w") as f:
for fname in tqdm(val):
wavpath = fname
@ -101,8 +102,8 @@ if __name__ == "__main__":
if args.tiny:
config_template["model"]["filter_channels"] = 512
print("Writing configs/config.json")
logger.info("Writing to configs/config.json")
with open("configs/config.json", "w") as f:
json.dump(config_template, f, indent=2)
print("Writing configs/diffusion.yaml")
logger.info("Writing to configs/diffusion.yaml")
du.save_config("configs/diffusion.yaml",d_config_template)
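For reference, the `loguru` logger adopted throughout this commit needs no configuration and formats messages with `str.format`-style braces rather than `print`-style comma arguments; a minimal sketch:
```python
from loguru import logger

rank, device = 0, "cuda:0"
logger.info("Writing {}", "filelists/train.txt")   # brace-style formatting
logger.info(f"Rank {rank} uses device {device}")   # plain f-strings also work
logger.warning("Skip too short audio: {}", "example.wav")
```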

View File

@ -10,6 +10,7 @@ import librosa
import numpy as np
import torch
import torch.multiprocessing as mp
from loguru import logger
from tqdm import tqdm
import diffusion.logger.utils as du
@ -27,13 +28,11 @@ hop_length = hps.data.hop_length
speech_encoder = hps["model"]["speech_encoder"]
def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
# print(filename)
def process_one(filename, hmodel,f0p,rank,diff=False,mel_extractor=None):
wav, sr = librosa.load(filename, sr=sampling_rate)
audio_norm = torch.FloatTensor(wav)
audio_norm = audio_norm.unsqueeze(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device(f"cuda:{rank}")
soft_path = filename + ".soft.pt"
if not os.path.exists(soft_path):
wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@ -106,17 +105,17 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
print("Loading speech encoder for content...")
logger.info("Loading speech encoder for content...")
rank = mp.current_process()._identity
rank = rank[0] if len(rank) > 0 else 0
gpu_id = 0  # default so the CPU-only path below keeps gpu_id defined
if torch.cuda.is_available():
gpu_id = rank % torch.cuda.device_count()
device = torch.device(f"cuda:{gpu_id}")
print("Rank {rank} uses device {device}")
logger.info(f"Rank {rank} uses device {device}")
hmodel = utils.get_speech_encoder(speech_encoder, device=device)
print("Loaded speech encoder.")
logger.info(f"Loaded speech encoder for rank {rank}")
for filename in tqdm(file_chunk):
process_one(filename, hmodel, f0p, diff, mel_extractor)
process_one(filename, hmodel, f0p, gpu_id, diff, mel_extractor)
def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
with ProcessPoolExecutor(max_workers=num_processes) as executor:
@ -151,9 +150,11 @@ if __name__ == "__main__":
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(speech_encoder)
print(f0p)
print("use_diff: ", args.use_diff)
print("device: ", device)
logger.info("Using device: ", device)
logger.info("Using SpeechEncoder: " + speech_encoder)
logger.info("Using extractor: " + f0p)
logger.info("Using diff Mode: " + str( args.use_diff))
if args.use_diff:
print("use_diff")
print("Loading Mel Extractor...")

View File

@ -10,6 +10,8 @@ torch
torchaudio
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx

View File

@ -9,6 +9,8 @@ torch==1.13.1
torchaudio==0.13.1
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx

View File

@ -15,6 +15,8 @@ sounddevice==0.4.5
SoundFile==0.10.3.post1
starlette==0.19.1
tqdm==4.63.0
rich
loguru
torchcrepe
scikit-maad
praat-parselmouth

View File

@ -6,8 +6,8 @@ from multiprocessing import cpu_count
import librosa
import numpy as np
from rich.progress import track
from scipy.io import wavfile
from tqdm import tqdm
def load_wav(wav_path):
@ -81,7 +81,7 @@ def process_all_speakers():
if os.path.isdir(spk_dir):
print(spk_dir)
futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
pass
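For reference, `rich.progress.track`, which replaces `tqdm` here, wraps any iterable and renders a progress bar; `total` must be passed explicitly when the iterable has no `len()`, as with `as_completed`. A minimal usage sketch:
```python
import time
from rich.progress import track

for _ in track(range(20), total=20, description="resampling:"):
    time.sleep(0.05)  # stand-in for waiting on a completed future
```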

View File

@ -1,6 +1,7 @@
import argparse
import torch
from loguru import logger
from torch.optim import lr_scheduler
from diffusion.data_loaders import get_data_loaders
@ -28,8 +29,8 @@ if __name__ == '__main__':
# load config
args = utils.load_config(cmd.config)
print(' > config:', cmd.config)
print(' > exp:', args.env.expdir)
logger.info(' > config: ' + cmd.config)
logger.info(' > exp: ' + args.env.expdir)
# load vocoder
vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@ -47,7 +48,7 @@ if __name__ == '__main__':
args.model.k_step_max
)
print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
# load parameters
optimizer = torch.optim.AdamW(model.parameters())