From 856d500777a7b65d3396a4874ca21a345eb9a2f1 Mon Sep 17 00:00:00 2001
From: Ftps
Date: Thu, 22 Jun 2023 03:04:03 +0900
Subject: [PATCH 1/3] Fix vencoder warning

---
 vencoder/CNHubertLarge.py         |  8 +++++---
 vencoder/ContentVec256L12_Onnx.py | 14 +++++++++-----
 vencoder/ContentVec256L9.py       | 10 ++++++----
 vencoder/ContentVec256L9_Onnx.py  |  8 +++++---
 vencoder/ContentVec768L12.py      | 10 ++++++----
 vencoder/ContentVec768L12_Onnx.py | 16 ++++++++++------
 vencoder/ContentVec768L9_Onnx.py  | 14 +++++++++-----
 vencoder/DPHubert.py              | 12 +++++++-----
 vencoder/HubertSoft.py            | 11 +++++++----
 vencoder/HubertSoft_Onnx.py       | 16 ++++++++++------
 vencoder/WavLMBasePlus.py         | 12 +++++++-----
 vencoder/WhisperPPG.py            |  7 ++++---
 vencoder/WhisperPPGLarge.py       |  7 ++++---
 vencoder/encoder.py               | 12 ++++++------
 14 files changed, 95 insertions(+), 62 deletions(-)

diff --git a/vencoder/CNHubertLarge.py b/vencoder/CNHubertLarge.py
index 9db9378..e8eacf3 100644
--- a/vencoder/CNHubertLarge.py
+++ b/vencoder/CNHubertLarge.py
@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class CNHubertLarge(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/chinese-hubert-large-fairseq-ckpt.pt",device=None):
+    def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 1024
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -20,7 +22,7 @@ class CNHubertLarge(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -29,5 +31,5 @@ class CNHubertLarge(SpeechEncoder):
             "padding_mask": padding_mask.to(wav.device)
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs) 
+            logits = self.model.extract_features(**inputs)
         return logits[0].transpose(1, 2)
\ No newline at end of file
diff --git a/vencoder/ContentVec256L12_Onnx.py b/vencoder/ContentVec256L12_Onnx.py
index 9ad5085..6663c06 100644
--- a/vencoder/ContentVec256L12_Onnx.py
+++ b/vencoder/ContentVec256L12_Onnx.py
@@ -2,24 +2,28 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec256L12_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-256-layer-12.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-256-layer-12.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 256
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cpu' or device == torch.device("cpu") or device is None:
-            providers = ['CPUExecutionProvider']
-        elif device == 'cuda' or device == torch.device("cuda"):
+
+        if device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        else:
+            providers = ['CPUExecutionProvider']
+
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
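
Note: the provider-selection rewrite above drops the redundant three-way
branch; anything that is not an explicit CUDA request now falls through to the
CPU provider, which also covers device=None. A minimal standalone sketch of
the new logic (the helper name pick_providers is hypothetical, for
illustration only):

    import onnxruntime
    import torch

    def pick_providers(device):
        # Explicit CUDA request -> CUDA with CPU fallback; everything else
        # (None, 'cpu', torch.device("cpu"), ...) -> CPU only.
        if device == 'cuda' or device == torch.device("cuda"):
            return ['CUDAExecutionProvider', 'CPUExecutionProvider']
        return ['CPUExecutionProvider']

    # e.g. session = onnxruntime.InferenceSession(
    #     "pretrain/vec-256-layer-12.onnx", providers=pick_providers(None))
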
diff --git a/vencoder/ContentVec256L9.py b/vencoder/ContentVec256L9.py
index b0089c7..fef12cb 100644
--- a/vencoder/ContentVec256L9.py
+++ b/vencoder/ContentVec256L9.py
@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class ContentVec256L9(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
             [vec_path],
@@ -20,7 +22,7 @@ class ContentVec256L9(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -30,6 +32,6 @@ class ContentVec256L9(SpeechEncoder):
             "output_layer": 9,  # layer 9
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs) 
-            feats = self.model.final_proj(logits[0]) 
+            logits = self.model.extract_features(**inputs)
+            feats = self.model.final_proj(logits[0])
         return feats.transpose(1, 2)
diff --git a/vencoder/ContentVec256L9_Onnx.py b/vencoder/ContentVec256L9_Onnx.py
index fae2b92..27f7a93 100644
--- a/vencoder/ContentVec256L9_Onnx.py
+++ b/vencoder/ContentVec256L9_Onnx.py
@@ -3,7 +3,8 @@ import onnxruntime
 import torch
 
 class ContentVec256L9_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-256-layer-9.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-256-layer-9.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 256
         if device is None:
@@ -19,10 +20,11 @@ class ContentVec256L9_Onnx(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
+        
\ No newline at end of file
diff --git a/vencoder/ContentVec768L12.py b/vencoder/ContentVec768L12.py
index 0d1591c..b9f1856 100644
--- a/vencoder/ContentVec768L12.py
+++ b/vencoder/ContentVec768L12.py
@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class ContentVec768L12(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -20,7 +22,7 @@ class ContentVec768L12(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -30,5 +32,5 @@ class ContentVec768L12(SpeechEncoder):
             "output_layer": 12,  # layer 12
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs) 
-        return logits[0].transpose(1, 2)
\ No newline at end of file
+            logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)
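
Note: the fairseq-based encoders all share one call shape: mono audio viewed
as [1, T], an all-false padding mask, and extract_features with an optional
output_layer. A hedged sketch of that contract, assuming `model` is a loaded
fairseq ContentVec/HuBERT model as in the diffs above:

    import torch

    wav = torch.randn(16000)                       # 1 s of mono 16 kHz audio
    feats = wav.view(1, -1)                        # [batch, signal_length]
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats,
        "padding_mask": padding_mask,
        "output_layer": 12,                        # transformer layer to tap
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)  # ([B, frames, hidden], ...)
    embedding = logits[0].transpose(1, 2)          # [B, hidden_dim, frames]
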
diff --git a/vencoder/ContentVec768L12_Onnx.py b/vencoder/ContentVec768L12_Onnx.py
index 8dde0f1..0562623 100644
--- a/vencoder/ContentVec768L12_Onnx.py
+++ b/vencoder/ContentVec768L12_Onnx.py
@@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec768L12_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-768-layer-12.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-768-layer-12.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cpu' or device == torch.device("cpu") or device is None:
-            providers = ['CPUExecutionProvider']
-        elif device == 'cuda' or device == torch.device("cuda"):
+
+        if device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        else:
+            providers = ['CPUExecutionProvider']
+
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
diff --git a/vencoder/ContentVec768L9_Onnx.py b/vencoder/ContentVec768L9_Onnx.py
index 7cdac4c..40d6329 100644
--- a/vencoder/ContentVec768L9_Onnx.py
+++ b/vencoder/ContentVec768L9_Onnx.py
@@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec768L9_Onnx(SpeechEncoder):
     def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cpu' or device == torch.device("cpu") or device is None:
-            providers = ['CPUExecutionProvider']
-        elif device == 'cuda' or device == torch.device("cuda"):
+
+        if device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        else:
+            providers = ['CPUExecutionProvider']
+
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
diff --git a/vencoder/DPHubert.py b/vencoder/DPHubert.py
index 95b98b8..a62cbac 100644
--- a/vencoder/DPHubert.py
+++ b/vencoder/DPHubert.py
@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from vencoder.dphubert.model import wav2vec2_model
 
+
 class DPHubert(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/DPHuBERT-sp0.75.pth",device=None):
+    def __init__(self, vec_path="pretrain/DPHuBERT-sp0.75.pth", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -17,10 +19,10 @@ class DPHubert(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1) 
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
-        feats = feats[None,:]
+        feats = feats[None, :]
         with torch.no_grad():
             with torch.inference_mode():
-                units = self.model(feats)[0] 
-        return units.transpose(1,2) 
+                units = self.model(feats)[0]
+        return units.transpose(1,2)
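
Note: the `feats.mean(-1)` hunks in this series appear to change only trailing
whitespace; the stereo handling itself is untouched. For reference, a small
sketch of what that pre-processing does, assuming the input layout is
[signal_length, channels]:

    import torch

    stereo = torch.randn(16000, 2)  # [signal_length, channels]
    mono = stereo.mean(-1)          # average the channels -> [signal_length]
    assert mono.dim() == 1
    batched = mono[None, :]         # [1, signal_length], as DPHubert feeds it
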
{}".format(vec_path)) if device is None: self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -17,10 +19,10 @@ class DPHubert(SpeechEncoder): def encoder(self, wav): feats = wav if feats.dim() == 2: # double channels - feats = feats.mean(-1) + feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() - feats = feats[None,:] + feats = feats[None, :] with torch.no_grad(): with torch.inference_mode(): - units = self.model(feats)[0] - return units.transpose(1,2) + units = self.model(feats)[0] + return units.transpose(1,2) diff --git a/vencoder/HubertSoft.py b/vencoder/HubertSoft.py index c7155e9..9847a7b 100644 --- a/vencoder/HubertSoft.py +++ b/vencoder/HubertSoft.py @@ -1,8 +1,11 @@ from vencoder.encoder import SpeechEncoder import torch from vencoder.hubert import hubert_model + + class HubertSoft(SpeechEncoder): - def __init__(self,vec_path = "pretrain/hubert-soft-0d54a1f4.pt",device=None): + def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None): + super().__init__() print("load model(s) from {}".format(vec_path)) hubert_soft = hubert_model.hubert_soft(vec_path) if device is None: @@ -15,10 +18,10 @@ class HubertSoft(SpeechEncoder): def encoder(self, wav): feats = wav if feats.dim() == 2: # double channels - feats = feats.mean(-1) + feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() feats = feats[None,None,:] with torch.no_grad(): with torch.inference_mode(): - units = self.model.units(feats) - return units.transpose(1,2) + units = self.model.units(feats) + return units.transpose(1,2) diff --git a/vencoder/HubertSoft_Onnx.py b/vencoder/HubertSoft_Onnx.py index 06f10a4..9b502d8 100644 --- a/vencoder/HubertSoft_Onnx.py +++ b/vencoder/HubertSoft_Onnx.py @@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder import onnxruntime import torch + class HubertSoft_Onnx(SpeechEncoder): - def __init__(self,vec_path = "pretrain/hubert-soft.onnx",device=None): + def __init__(self, vec_path="pretrain/hubert-soft.onnx", device=None): + super().__init__() print("load model(s) from {}".format(vec_path)) self.hidden_dim = 256 if device is None: self.dev = torch.device("cpu") else: self.dev = torch.device(device) - if device == 'cpu' or device == torch.device("cpu") or device is None: - providers = ['CPUExecutionProvider'] - elif device == 'cuda' or device == torch.device("cuda"): + + if device == 'cuda' or device == torch.device("cuda"): providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] + else: + providers = ['CPUExecutionProvider'] + self.model = onnxruntime.InferenceSession(vec_path, providers=providers) def encoder(self, wav): feats = wav if feats.dim() == 2: # double channels - feats = feats.mean(-1) + feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() feats = feats.view(1, -1) feats = feats.unsqueeze(0).cpu().detach().numpy() onnx_input = {self.model.get_inputs()[0].name: feats} logits = self.model.run(None, onnx_input) - return torch.tensor(logits[0]).transpose(1, 2).to(self.dev) \ No newline at end of file + return torch.tensor(logits[0]).transpose(1, 2).to(self.dev) diff --git a/vencoder/WavLMBasePlus.py b/vencoder/WavLMBasePlus.py index b105dbc..8d45a35 100644 --- a/vencoder/WavLMBasePlus.py +++ b/vencoder/WavLMBasePlus.py @@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder import torch from vencoder.wavlm.WavLM import WavLM, WavLMConfig + class WavLMBasePlus(SpeechEncoder): - def __init__(self,vec_path = "pretrain/WavLM-Base+.pt",device=None): + def __init__(self, vec_path="pretrain/WavLM-Base+.pt", 
diff --git a/vencoder/WhisperPPG.py b/vencoder/WhisperPPG.py
index aa988b0..5d156cc 100644
--- a/vencoder/WhisperPPG.py
+++ b/vencoder/WhisperPPG.py
@@ -6,7 +6,8 @@ from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
 
 
 class WhisperPPG(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/medium.pt",device=None):
+    def __init__(self, vec_path="pretrain/medium.pt", device=None):
+        super().__init__()
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
@@ -26,5 +27,5 @@ class WhisperPPG(SpeechEncoder):
         mel = log_mel_spectrogram(audio).to(self.dev)
         with torch.no_grad():
             ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
-        return ppg[None,:,:].transpose(1, 2)
+            ppg = torch.FloatTensor(ppg[:ppgln, ]).to(self.dev)
+        return ppg[None, :, :].transpose(1, 2)
diff --git a/vencoder/WhisperPPGLarge.py b/vencoder/WhisperPPGLarge.py
index 8deef0a..4494c08 100644
--- a/vencoder/WhisperPPGLarge.py
+++ b/vencoder/WhisperPPGLarge.py
@@ -6,7 +6,8 @@ from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
 
 
 class WhisperPPGLarge(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/large-v2.pt",device=None):
+    def __init__(self, vec_path="pretrain/large-v2.pt", device=None):
+        super().__init__()
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
@@ -26,5 +27,5 @@ class WhisperPPGLarge(SpeechEncoder):
         mel = log_mel_spectrogram(audio).to(self.dev)
         with torch.no_grad():
             ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
-        return ppg[None,:,:].transpose(1, 2)
+            ppg = torch.FloatTensor(ppg[:ppgln, ]).to(self.dev)
+        return ppg[None, :, :].transpose(1, 2)
diff --git a/vencoder/encoder.py b/vencoder/encoder.py
index 2cf5678..67e94c7 100644
--- a/vencoder/encoder.py
+++ b/vencoder/encoder.py
@@ -1,12 +1,12 @@
 class SpeechEncoder(object):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
-        self.model = None #This is Model
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        self.model = None  # This is Model
         self.hidden_dim = 768
         pass
 
-    def encoder(self,wav):
-        '''
+    def encoder(self, wav):
+        """
         input: wav:[batchsize,signal_length]
         output: embedding:[batchsize,hidden_dim,wav_frame]
-        '''
-        pass
\ No newline at end of file
+        """
+        pass

From 80b7649880d8b046bad5d23462537587b3ba54fb Mon Sep 17 00:00:00 2001
From: YuriHead
Date: Fri, 23 Jun 2023 00:51:19 +0800
Subject: [PATCH 2/3] Update encoder.py

---
 vencoder/encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vencoder/encoder.py b/vencoder/encoder.py
index 5d27228..a64173a 100644
--- a/vencoder/encoder.py
+++ b/vencoder/encoder.py
@@ -6,7 +6,7 @@ class SpeechEncoder(object):
 
     def encoder(self,wav):
-        '''
+        """
         input: wav:[signal_length]
         output: embedding:[batchsize,hidden_dim,wav_frame]
         """
         pass
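
Note: with patches 2 and 3 applied, the base class documents the contract
every subclass above implements: a wav tensor of shape [signal_length] in, an
embedding of shape [batchsize, hidden_dim, wav_frame] out. A hedged usage
sketch against that interface (assumes the fairseq checkpoint named in the
diffs is present under pretrain/):

    import torch
    from vencoder.ContentVec768L12 import ContentVec768L12

    enc = ContentVec768L12(device="cpu")  # prints "load model(s) from ..."
    wav = torch.randn(16000)              # [signal_length], mono
    emb = enc.encoder(wav)                # [1, 768, n_frames]
    print(emb.shape)
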
From 7d76d46d293bbb7e3de89d81e89b470f1598e6f9 Mon Sep 17 00:00:00 2001
From: YuriHead
Date: Fri, 23 Jun 2023 00:52:53 +0800
Subject: [PATCH 3/3] Update encoder.py

---
 vencoder/encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vencoder/encoder.py b/vencoder/encoder.py
index a64173a..9ad120d 100644
--- a/vencoder/encoder.py
+++ b/vencoder/encoder.py
@@ -5,7 +5,7 @@ class SpeechEncoder(object):
         pass
 
-    def encoder(self,wav):
+    def encoder(self, wav):
         """
         input: wav:[signal_length]
         output: embedding:[batchsize,hidden_dim,wav_frame]
         """
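
Note: patch 3 finishes the cleanup, bringing the signature in line with the
PEP 8 spacing from patch 1 after patch 2 switched the docstring to PEP 257
double quotes. The base class that results from the series, reconstructed
here as a sketch (the intermediate blobs 5d27228 and a64173a are not shown in
full above):

    class SpeechEncoder(object):
        def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
            self.model = None  # This is Model
            self.hidden_dim = 768
            pass

        def encoder(self, wav):
            """
            input: wav:[signal_length]
            output: embedding:[batchsize,hidden_dim,wav_frame]
            """
            pass
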