diff --git a/README.md b/README.md
index 1cd0d5a..c200a9f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
+LOGO
 # SoftVC VITS Singing Voice Conversion
@@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
-**Speed Up preprocess**
-
-If your dataset is pretty large,you can increase the param `--num_processes` like that:
-
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-All the worker will be assigned to different GPU if you have more than one GPUs.
-
 After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
 
 #### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**Speed Up Preprocessing**
+
+If your dataset is fairly large, you can increase the `--num_processes` parameter, like this:
+
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+All workers will be automatically assigned to different GPUs if you have more than one GPU.
+
 After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
 
 ## 🏋️‍ Training
diff --git a/README_zh_CN.md b/README_zh_CN.md
index 7fa8670..8c5b7a5 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -1,5 +1,6 @@
-
+LOGO
+
 # SoftVC VITS Singing Voice Conversion
 
 [**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@@ -268,13 +269,6 @@ wavlmbase+
 ```shell
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
-
-**加速预处理**
-如若您的数据集比较大,可以尝试添加`--num_processes`参数:
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-所有的Workers会被自动分配到多个GPU上(如果您有多个GPU的话)
 使用后训练出的模型将匹配到输入源响度,否则为训练集响度。
 
 #### 此时可以在生成的 config.json 与 diffusion.yaml 修改部分参数
@@ -335,6 +329,13 @@ fcpe
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**加速预处理**
+如若您的数据集比较大,可以尝试添加`--num_processes`参数:
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+所有的Workers会被自动分配到多个GPU上(如果您有多个GPU的话)
+
 执行完以上步骤后 dataset 目录便是预处理完成的数据,可以删除 dataset_raw 文件夹了
 
 ## 🏋️‍ 训练
diff --git a/modules/F0Predictor/rmvpe/inference.py b/modules/F0Predictor/rmvpe/inference.py
index 40b6e94..02d2188 100644
--- a/modules/F0Predictor/rmvpe/inference.py
+++ b/modules/F0Predictor/rmvpe/inference.py
@@ -28,7 +28,7 @@ class RMVPE:
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
             hidden = self.model(mel)
             return hidden[:, :n_frames]
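Note on the `reflect` → `constant` change above: PyTorch's reflection padding requires the pad amount to be strictly smaller than the padded dimension, so a clip with fewer mel frames than the pad width (here, padding up to the next multiple of 32) raises a RuntimeError, while zero padding always succeeds. A standalone illustration, not part of the patch:

```python
import torch
import torch.nn.functional as F

mel = torch.randn(1, 128, 3)  # a very short clip: only 3 mel frames
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad to a multiple of 32 -> 29

# mode='reflect' would raise here: reflection padding requires
# pad < input size along the padded dimension (29 >= 3).
padded = F.pad(mel, (0, pad), mode='constant')  # zero-padding always works
print(padded.shape)  # torch.Size([1, 128, 32])
```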
diff --git a/preprocess_flist_config.py b/preprocess_flist_config.py
index fb1b8a3..e7fbe21 100644
--- a/preprocess_flist_config.py
+++ b/preprocess_flist_config.py
@@ -5,6 +5,7 @@ import re
 import wave
 from random import shuffle
 
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -47,9 +48,9 @@ if __name__ == "__main__":
         if not file.endswith("wav"):
             continue
         if not pattern.match(file):
-            print(f"warning:文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
+            logger.warning(f"文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
         if get_wav_duration(file) < 0.3:
-            print("skip too short audio:", file)
+            logger.info("Skip too short audio: " + file)
             continue
         new_wavs.append(file)
     wavs = new_wavs
@@ -60,13 +61,13 @@
     shuffle(train)
     shuffle(val)
 
-    print("Writing", args.train_list)
+    logger.info("Writing " + args.train_list)
     with open(args.train_list, "w") as f:
         for fname in tqdm(train):
             wavpath = fname
             f.write(wavpath + "\n")
 
-    print("Writing", args.val_list)
+    logger.info("Writing " + args.val_list)
     with open(args.val_list, "w") as f:
         for fname in tqdm(val):
             wavpath = fname
@@ -101,8 +102,8 @@ if __name__ == "__main__":
     if args.tiny:
         config_template["model"]["filter_channels"] = 512
 
-    print("Writing configs/config.json")
+    logger.info("Writing to configs/config.json")
     with open("configs/config.json", "w") as f:
         json.dump(config_template, f, indent=2)
-    print("Writing configs/diffusion.yaml")
+    logger.info("Writing to configs/diffusion.yaml")
     du.save_config("configs/diffusion.yaml",d_config_template)
diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py
index 0ace6de..485a4a6 100644
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@@ -10,6 +10,7 @@ import librosa
 import numpy as np
 import torch
 import torch.multiprocessing as mp
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -27,13 +28,11 @@ hop_length = hps.data.hop_length
 speech_encoder = hps["model"]["speech_encoder"]
 
 
-def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
-    # print(filename)
+def process_one(filename, hmodel, f0p, rank, diff=False, mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
+    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}" if torch.cuda.is_available() else "cpu")
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@@ -106,17 +105,17 @@
 
 def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
-    print("Loading speech encoder for content...")
+    logger.info("Loading speech encoder for content...")
     rank = mp.current_process()._identity
     rank = rank[0] if len(rank) > 0 else 0
     if torch.cuda.is_available():
         gpu_id = rank % torch.cuda.device_count()
         device = torch.device(f"cuda:{gpu_id}")
-        print("Rank {rank} uses device {device}")
+        logger.info(f"Rank {rank} uses device {device}")
     hmodel = utils.get_speech_encoder(speech_encoder, device=device)
-    print("Loaded speech encoder.")
+    logger.info(f"Loaded speech encoder for rank {rank}")
     for filename in tqdm(file_chunk):
-        process_one(filename, hmodel, f0p, diff, mel_extractor)
+        process_one(filename, hmodel, f0p, rank, diff, mel_extractor)
 
 
 def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
     with ProcessPoolExecutor(max_workers=num_processes) as executor:
@@ -151,9 +150,11 @@
 
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     print(speech_encoder)
-    print(f0p)
-    print("use_diff: ", args.use_diff)
-    print("device: ", device)
+    logger.info(f"Using device: {device}")
+    logger.info(f"Using SpeechEncoder: {speech_encoder}")
+    logger.info(f"Using extractor: {f0p}")
+    logger.info(f"Using diff Mode: {args.use_diff}")
+
     if args.use_diff:
         print("use_diff")
         print("Loading Mel Extractor...")
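The pattern `process_batch` uses above is worth spelling out: each `ProcessPoolExecutor` worker reads its own multiprocessing identity and pins itself to a GPU round-robin, so models are loaded once per worker on a stable device. A minimal self-contained sketch of the same idea (illustrative names, not part of the patch):

```python
import torch
import torch.multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor


def worker(chunk):
    # _identity is a tuple such as (3,): this worker's 1-based index in the pool
    identity = mp.current_process()._identity
    rank = identity[0] if len(identity) > 0 else 0
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    else:
        device = torch.device("cpu")
    # load models once per worker on `device`, then process every file in `chunk`
    return f"rank {rank} -> {device} ({len(chunk)} files)"


if __name__ == "__main__":
    files = [f"{i:04d}.wav" for i in range(100)]  # placeholder file list
    chunks = [files[i::4] for i in range(4)]      # one chunk per worker
    with ProcessPoolExecutor(max_workers=4) as ex:
        for line in ex.map(worker, chunks):
            print(line)
```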
diff --git a/requirements.txt b/requirements.txt
index 962a697..9f5d55e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,8 @@ torch
 torchaudio
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
diff --git a/requirements_onnx_encoder.txt b/requirements_onnx_encoder.txt
index a624622..cfde17c 100644
--- a/requirements_onnx_encoder.txt
+++ b/requirements_onnx_encoder.txt
@@ -9,6 +9,8 @@ torch==1.13.1
 torchaudio==0.13.1
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
diff --git a/requirements_win.txt b/requirements_win.txt
index 924a641..461a992 100644
--- a/requirements_win.txt
+++ b/requirements_win.txt
@@ -15,6 +15,8 @@ sounddevice==0.4.5
 SoundFile==0.10.3.post1
 starlette==0.19.1
 tqdm==4.63.0
+rich
+loguru
 torchcrepe
 scikit-maad
 praat-parselmouth
diff --git a/resample.py b/resample.py
index 50eacc1..af421fd 100644
--- a/resample.py
+++ b/resample.py
@@ -6,8 +6,8 @@ from multiprocessing import cpu_count
 
 import librosa
 import numpy as np
+from rich.progress import track
 from scipy.io import wavfile
-from tqdm import tqdm
 
 
 def load_wav(wav_path):
@@ -81,7 +81,7 @@ def process_all_speakers():
         if os.path.isdir(spk_dir):
             print(spk_dir)
             futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
-            for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
                 pass
diff --git a/train_diff.py b/train_diff.py
index adf5fb3..65ba382 100644
--- a/train_diff.py
+++ b/train_diff.py
@@ -1,6 +1,7 @@
 import argparse
 
 import torch
+from loguru import logger
 from torch.optim import lr_scheduler
 
 from diffusion.data_loaders import get_data_loaders
@@ -28,8 +29,8 @@ if __name__ == '__main__':
 
     # load config
     args = utils.load_config(cmd.config)
-    print(' > config:', cmd.config)
-    print(' > exp:', args.env.expdir)
+    logger.info(' > config: ' + cmd.config)
+    logger.info(' > exp: ' + args.env.expdir)
 
     # load vocoder
     vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@@ -47,7 +48,7 @@ if __name__ == '__main__':
         args.model.k_step_max
     )
 
-    print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
+    logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
 
     # load parameters
     optimizer = torch.optim.AdamW(model.parameters())
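One caveat worth noting about the `print` → `logger` migration: unlike `print`, loguru's `logger.info()` does not join extra positional arguments onto the message — it treats them as `str.format()` arguments for `{}` placeholders, so `logger.info("Using device: ", device)` would log only the literal prefix and silently drop the value. That is why the calls in this patch are written as f-strings. A quick demonstration:

```python
from loguru import logger

device = "cuda:0"
logger.info("Using device: ", device)    # logs "Using device: " -- the value is silently dropped
logger.info("Using device: {}", device)  # OK: positional args fill {} placeholders
logger.info(f"Using device: {device}")   # OK: formatting done before the call
```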