diff --git a/data_utils.py b/data_utils.py
index 5929dbc..93c3d5c 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -47,6 +47,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         audio_norm = audio / self.max_wav_value
         audio_norm = audio_norm.unsqueeze(0)
         spec_filename = filename.replace(".wav", ".spec.pt")
+
+        # Ideally, all data generated after Mar 25 should have .spec.pt
         if os.path.exists(spec_filename):
             spec = torch.load(spec_filename)
         else:
diff --git a/preprocess_flist_config.py b/preprocess_flist_config.py
index 6a29726..2717e51 100644
--- a/preprocess_flist_config.py
+++ b/preprocess_flist_config.py
@@ -25,13 +25,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
     parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
-    parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list")
     parser.add_argument("--source_dir", type=str, default="./dataset/44k", help="path to source dir")
     args = parser.parse_args()
 
     train = []
     val = []
-    test = []
     idx = 0
     spk_dict = {}
     spk_id = 0
@@ -51,13 +49,11 @@ if __name__ == "__main__":
             new_wavs.append(file)
         wavs = new_wavs
         shuffle(wavs)
-        train += wavs[2:-2]
+        train += wavs[2:]
         val += wavs[:2]
-        test += wavs[-2:]
 
     shuffle(train)
     shuffle(val)
-    shuffle(test)
 
     print("Writing", args.train_list)
     with open(args.train_list, "w") as f:
@@ -70,12 +66,6 @@ if __name__ == "__main__":
         for fname in tqdm(val):
             wavpath = fname
             f.write(wavpath + "\n")
-
-    print("Writing", args.test_list)
-    with open(args.test_list, "w") as f:
-        for fname in tqdm(test):
-            wavpath = fname
-            f.write(wavpath + "\n")
 
     config_template["spk"] = spk_dict
     config_template["model"]["n_speakers"] = spk_id
diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py
index 66bac7e..763fb0d 100644
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@@ -7,10 +7,12 @@ from random import shuffle
 import torch
 from glob import glob
 from tqdm import tqdm
+from modules.mel_processing import spectrogram_torch
 
 import utils
 import logging
-logging.getLogger('numba').setLevel(logging.WARNING)
+
+logging.getLogger("numba").setLevel(logging.WARNING)
 import librosa
 import numpy as np
 
@@ -29,11 +31,42 @@ def process_one(filename, hmodel):
     wav16k = torch.from_numpy(wav16k).to(device)
     c = utils.get_hubert_content(hmodel, wav_16k_tensor=wav16k)
     torch.save(c.cpu(), soft_path)
+
     f0_path = filename + ".f0.npy"
     if not os.path.exists(f0_path):
-        f0 = utils.compute_f0_dio(wav, sampling_rate=sampling_rate, hop_length=hop_length)
+        f0 = utils.compute_f0_dio(
+            wav, sampling_rate=sampling_rate, hop_length=hop_length
+        )
         np.save(f0_path, f0)
+
+    spec_path = filename.replace(".wav", ".spec.pt")
+    if not os.path.exists(spec_path):
+        # Process spectrogram
+        # The following code can't be replaced by torch.FloatTensor(wav)
+        # because load_wav_to_torch returns a tensor that needs to be normalized
+
+        audio, sr = utils.load_wav_to_torch(filename)
+        if sr != hps.data.sampling_rate:
+            raise ValueError(
+                "{} SR doesn't match target {} SR".format(
+                    sr, hps.data.sampling_rate
+                )
+            )
+
+        audio_norm = audio / hps.data.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+
+        spec = spectrogram_torch(
+            audio_norm,
+            hps.data.filter_length,
+            hps.data.sampling_rate,
+            hps.data.hop_length,
+            hps.data.win_length,
+            center=False,
+        )
+        spec = torch.squeeze(spec, 0)
+        torch.save(spec, spec_path)
+
 
 
 def process_batch(filenames):
print("Loading hubert for content...") @@ -46,17 +79,23 @@ def process_batch(filenames): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--in_dir", type=str, default="dataset/44k", help="path to input dir") + parser.add_argument( + "--in_dir", type=str, default="dataset/44k", help="path to input dir" + ) args = parser.parse_args() - filenames = glob(f'{args.in_dir}/*/*.wav', recursive=True) # [:10] + filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True) # [:10] shuffle(filenames) - multiprocessing.set_start_method('spawn',force=True) + multiprocessing.set_start_method("spawn", force=True) num_processes = 1 chunk_size = int(math.ceil(len(filenames) / num_processes)) - chunks = [filenames[i:i + chunk_size] for i in range(0, len(filenames), chunk_size)] + chunks = [ + filenames[i : i + chunk_size] for i in range(0, len(filenames), chunk_size) + ] print([len(c) for c in chunks]) - processes = [multiprocessing.Process(target=process_batch, args=(chunk,)) for chunk in chunks] + processes = [ + multiprocessing.Process(target=process_batch, args=(chunk,)) for chunk in chunks + ] for p in processes: p.start() diff --git a/spec_gen.py b/spec_gen.py deleted file mode 100644 index 9476395..0000000 --- a/spec_gen.py +++ /dev/null @@ -1,22 +0,0 @@ -from data_utils import TextAudioSpeakerLoader -import json -from tqdm import tqdm - -from utils import HParams - -config_path = 'configs/config.json' -with open(config_path, "r") as f: - data = f.read() -config = json.loads(data) -hps = HParams(**config) - -train_dataset = TextAudioSpeakerLoader("filelists/train.txt", hps) -test_dataset = TextAudioSpeakerLoader("filelists/test.txt", hps) -eval_dataset = TextAudioSpeakerLoader("filelists/val.txt", hps) - -for _ in tqdm(train_dataset): - pass -for _ in tqdm(eval_dataset): - pass -for _ in tqdm(test_dataset): - pass \ No newline at end of file