Merge branch '4.1-Latest' of https://github.com/svc-develop-team/so-vits-svc into 4.1-Latest

2023-07-22 20:29:22 +08:00 · 2023-07-22 20:29:22 +08:00 · 277ed41e23
parent dc4233279b d9b108ecbd
commit 277ed41e23
5 changed files with 26 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -258,6 +258,15 @@ Add `--vol_aug` if you want to enable loudness embedding:
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```

+**Speed up preprocessing**
+
+If your dataset is large, you can speed up preprocessing by increasing the `--num_processes` parameter, like this:
+
+```shell
+python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
+```
+If you have more than one GPU, the workers will be automatically distributed across them.
+
 After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.

 #### You can modify some parameters in the generated config.json and diffusion.yaml
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@ -260,6 +260,12 @@ wavlmbase+
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```

+**加速预处理**
+如若您的数据集比较大，可以尝试添加`--num_processes`参数：
+```shell
+python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
+```
+所有的Workers会被自动分配到多个GPU上（如果您有多个GPU的话）
 使用后训练出的模型将匹配到输入源响度，否则为训练集响度。

 #### 此时可以在生成的 config.json 与 diffusion.yaml 修改部分参数
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@ -232,7 +232,7 @@ class Svc(object):
                if speaker_id is None:
                    raise RuntimeError("The name you entered is not in the speaker list!")
                feature_index = self.cluster_model[speaker_id]
-                feat_np = c.transpose(0,1).cpu().numpy()
+                feat_np = np.ascontiguousarray(c.transpose(0,1).cpu().numpy())
                if self.big_npy is None or self.now_spk_id != speaker_id:
                   self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
                   self.now_spk_id = speaker_id
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@ -1,6 +1,5 @@
 import argparse
 import logging
-import multiprocessing
 import os
 import random
 from concurrent.futures import ProcessPoolExecutor
@ -10,6 +9,7 @@ from random import shuffle
 import librosa
 import numpy as np
 import torch
+import torch.multiprocessing as mp
 from tqdm import tqdm

 import diffusion.logger.utils as du
@ -106,10 +106,14 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):

 def process_batch(file_chunk, f0p, diff=False, mel_extractor=None):
     print("Loading speech encoder for content...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    rank = mp.current_process()._identity  # private attribute; an empty tuple is mapped to rank 0 below
+    rank = rank[0] if len(rank) > 0 else 0
+    device = torch.device("cpu")  # fallback so `device` is always bound on hosts without CUDA
+    if torch.cuda.is_available():
+        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")  # spread workers over GPUs by rank
+    print(f"Rank {rank} uses device {device}")
     hmodel = utils.get_speech_encoder(speech_encoder, device=device)
     print("Loaded speech encoder.")
-
     for filename in tqdm(file_chunk):
         process_one(filename, hmodel, f0p, diff, mel_extractor)

@ -121,7 +125,6 @@ def parallel_process(filenames, num_processes, f0p, diff, mel_extractor):
            end = int((i + 1) * len(filenames) / num_processes)
            file_chunk = filenames[start:end]
            tasks.append(executor.submit(process_batch, file_chunk, f0p, diff, mel_extractor))
-
        for task in tqdm(tasks):
            task.result()

@ -139,7 +142,6 @@ if __name__ == "__main__":
    parser.add_argument(
        '--num_processes', type=int, default=1, help='You are advised to set the number of processes to the same as the number of CPU cores'
    )
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()
    f0p = args.f0_predictor
    print(speech_encoder)
@ -148,16 +150,16 @@ if __name__ == "__main__":
    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
-        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device = device)
+        mel_extractor = Vocoder(dconfig.vocoder.type, dconfig.vocoder.ckpt, device = "cuda:0")
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
    shuffle(filenames)
-    multiprocessing.set_start_method("spawn", force=True)
+    mp.set_start_method("spawn", force=True)

    num_processes = args.num_processes
    if num_processes == 0:
        num_processes = os.cpu_count()
    
-    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor)
+    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor)
--- a/utils.py
+++ b/utils.py
@ -43,7 +43,6 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
    if torch.isnan(f0_norm).any():
        exit(0)
    return f0_norm * x_mask
-
 def plot_data_to_numpy(x, y):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG: