diff --git a/configs_template/config_template.json b/configs_template/config_template.json
index 3353144..bbe0a30 100644
--- a/configs_template/config_template.json
+++ b/configs_template/config_template.json
@@ -54,7 +54,9 @@
     "use_spectral_norm": false,
     "gin_channels": 768,
     "ssl_dim": 768,
-    "n_speakers": 200
+    "n_speakers": 200,
+    "speech_encoder": "vec768l12",
+    "speaker_embedding": false
   },
   "spk": {
     "nyaru": 0,
diff --git a/data_utils.py b/data_utils.py
index 7c76fd1..5958403 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -65,8 +65,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         spk = filename.split("/")[-2]
         spk = torch.LongTensor([self.spk_map[spk]])
 
-        f0 = np.load(filename + ".f0.npy")
-        f0, uv = utils.interpolate_f0(f0)
+        f0, uv = np.load(filename + ".f0.npy", allow_pickle=True)
+        f0 = torch.FloatTensor(f0)
         uv = torch.FloatTensor(uv)
diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 960ebd5..1ddeb78 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -11,13 +11,11 @@ import gc
 import librosa
 import numpy as np
 # import onnxruntime
-import parselmouth
 import soundfile
 import torch
 import torchaudio
 
 import cluster
-from hubert import hubert_model
 import utils
 from models import SynthesizerTrn
diff --git a/modules/F0Predictor/HarvestF0Predictor.py b/modules/F0Predictor/HarvestF0Predictor.py
index fdb6016..2f3c015 100644
--- a/modules/F0Predictor/HarvestF0Predictor.py
+++ b/modules/F0Predictor/HarvestF0Predictor.py
@@ -44,7 +44,7 @@ class HarvestF0Predictor(F0Predictor):
     def compute_f0_uv(self, wav, p_len=None):
         if p_len is None:
             p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.dio(
+        f0, t = pyworld.harvest(
             wav.astype(np.double),
             fs=self.sampling_rate,
             f0_floor=self.f0_min,
diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py
index b374a26..3d23ba2 100644
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@@ -36,10 +36,11 @@ def process_one(filename, hmodel):
     if not os.path.exists(f0_path):
         from modules.F0Predictor.DioF0Predictor import DioF0Predictor
         f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length)
-        f0 = f0_predictor.compute_f0(
+        f0, uv = f0_predictor.compute_f0_uv(
             wav
         )
-        np.save(f0_path, f0)
+        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))
+
     spec_path = filename.replace(".wav", ".spec.pt")
     if not os.path.exists(spec_path):
diff --git a/hubert/put_hubert_ckpt_here b/pretrain/put_hubert_ckpt_here
similarity index 100%
rename from hubert/put_hubert_ckpt_here
rename to pretrain/put_hubert_ckpt_here
diff --git a/utils.py b/utils.py
index 5f866f2..68c987d 100644
--- a/utils.py
+++ b/utils.py
@@ -16,7 +16,6 @@ from scipy.io.wavfile import read
 import torch
 from torch.nn import functional as F
 from modules.commons import sequence_mask
-from hubert import hubert_model
 
 MATPLOTLIB_FLAG = False
diff --git a/vencoder/ContentVec256L9.py b/vencoder/ContentVec256L9.py
new file mode 100644
index 0000000..6d3846b
--- /dev/null
+++ b/vencoder/ContentVec256L9.py
@@ -0,0 +1,31 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+
+class ContentVec256L9(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        print("load model(s) from {}".format(vec_path))
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        self.hidden_dim = 256
+        self.model = models[0]
+        self.model.eval()
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+            "source": feats.to(wav.device),
+            "padding_mask": padding_mask.to(wav.device),
+            "output_layer": 9,  # layer 9
+        }
+        with torch.no_grad():
+            logits = self.model.extract_features(**inputs)
+            feats = self.model.final_proj(logits[0])
+        return feats.transpose(1, 2)
diff --git a/vencoder/ContentVec768L12.py b/vencoder/ContentVec768L12.py
new file mode 100644
index 0000000..e38e77e
--- /dev/null
+++ b/vencoder/ContentVec768L12.py
@@ -0,0 +1,30 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+
+class ContentVec768L12(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        print("load model(s) from {}".format(vec_path))
+        from fairseq import checkpoint_utils
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        self.hidden_dim = 768
+        self.model = models[0]
+        self.model.eval()
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+            "source": feats.to(wav.device),
+            "padding_mask": padding_mask.to(wav.device),
+            "output_layer": 12,  # layer 12
+        }
+        with torch.no_grad():
+            logits = self.model.extract_features(**inputs)
+            return logits[0].transpose(1, 2)
\ No newline at end of file
diff --git a/vencoder/HuberSoft.py b/vencoder/HuberSoft.py
new file mode 100644
index 0000000..44eee11
--- /dev/null
+++ b/vencoder/HuberSoft.py
@@ -0,0 +1,24 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.hubert import hubert_model
+
+class Hubersoft(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
+        print("load model(s) from {}".format(vec_path))
+        hubert_soft = hubert_model.hubert_soft(vec_path)
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.hidden_dim = 256
+        self.model = hubert_soft.to(self.dev)
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        with torch.inference_mode():
+            units = self.model.units(feats)
+            return units.transpose(1, 2)
\ No newline at end of file
diff --git a/hubert/__init__.py b/vencoder/__init__.py
similarity index 100%
rename from hubert/__init__.py
rename to vencoder/__init__.py
diff --git a/vencoder/encoder.py b/vencoder/encoder.py
new file mode 100644
index 0000000..e37411f
--- /dev/null
+++ b/vencoder/encoder.py
@@ -0,0 +1,12 @@
+class SpeechEncoder(object):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        self.model = None  # the wrapped model, set by each subclass
+        self.hidden_dim = 768
+        pass
+
+    def encoder(self, wav):
+        '''
+        input: wav: [batchsize, signal_length]
+        output: embedding: [batchsize, hidden_dim, wav_frame]
+        '''
+        pass
\ No newline at end of file
diff --git a/vencoder/hubert/__init__.py b/vencoder/hubert/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hubert/hubert_model.py b/vencoder/hubert/hubert_model.py
similarity index 100%
rename from hubert/hubert_model.py
rename to vencoder/hubert/hubert_model.py
diff --git a/hubert/hubert_model_onnx.py b/vencoder/hubert/hubert_model_onnx.py
similarity index 100%
rename from hubert/hubert_model_onnx.py
rename to vencoder/hubert/hubert_model_onnx.py
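Note on the new .f0.npy layout: preprocess_hubert_f0.py now saves the pitch curve and the voiced/unvoiced mask together in a single object array, which is why data_utils.py must pass allow_pickle=True when loading (np.load refuses pickled object arrays by default). A minimal round-trip sketch; the file name and array shapes below are illustrative, not taken from the patch:

    import numpy as np

    f0 = np.random.rand(200) * 220.0      # pitch in Hz, one value per frame
    uv = (f0 > 100.0).astype(np.float32)  # voiced/unvoiced flag per frame

    # Packing the pair with dtype=object stacks the two arrays along a new
    # first axis; NumPy has to pickle the result to serialize it.
    np.save("demo.f0.npy", np.asanyarray((f0, uv), dtype=object))

    # Loading therefore needs allow_pickle=True; unpacking the first axis
    # recovers f0 and uv in the order they were saved.
    f0_loaded, uv_loaded = np.load("demo.f0.npy", allow_pickle=True)
    print(f0_loaded.shape, uv_loaded.shape)  # (200,) (200,)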
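Note on encoder selection: the config template's new "speech_encoder" key names which vencoder class to instantiate, with every implementation exposing the same SpeechEncoder interface. The dispatch itself is not part of this diff; the sketch below assumes a hypothetical get_speech_encoder helper and the string names "vec256l9" and "hubertsoft" (only "vec768l12" appears in the template):

    import torch

    # Hypothetical lookup from the config string to a vencoder class.
    def get_speech_encoder(name, **kwargs):
        if name == "vec768l12":
            from vencoder.ContentVec768L12 import ContentVec768L12
            return ContentVec768L12(**kwargs)
        if name == "vec256l9":
            from vencoder.ContentVec256L9 import ContentVec256L9
            return ContentVec256L9(**kwargs)
        if name == "hubertsoft":
            from vencoder.HuberSoft import Hubersoft
            return Hubersoft(**kwargs)
        raise ValueError("unknown speech encoder: " + name)

    # Usage: each encoder takes a 1-D mono waveform and returns frame-level
    # features shaped [1, hidden_dim, frames].
    encoder = get_speech_encoder("vec768l12")
    feats = encoder.encoder(torch.randn(16000))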