Merge pull request #250 from Tps-F/Fix-vencoder-warning

Fix vencoder warning
YuriHead 2023-06-23 00:54:23 +08:00, committed by GitHub
commit 4f43531207
14 changed files with 96 additions and 62 deletions
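
The recurring changes across the 14 files: each encoder's __init__ now calls super().__init__() so the attributes declared on the SpeechEncoder base class are actually initialized; the ONNX-backed encoders explicitly select CPUExecutionProvider when the device is 'cpu' or None instead of falling through a CUDA-only check, which is what silences the onnxruntime provider warning the PR title refers to; the rest is whitespace cleanup (comma spacing in signatures, comment spacing, docstring quote style, and what appear to be end-of-file newline fixes). A minimal sketch of the provider-selection pattern the ONNX diffs below converge on, pulled out as a standalone helper (the helper itself is illustrative, not code from the PR):

    import onnxruntime
    import torch

    def pick_providers(device):
        # Pinning CPUExecutionProvider explicitly for cpu/None keeps
        # onnxruntime from warning that CUDAExecutionProvider is unavailable.
        if device == 'cpu' or device == torch.device("cpu") or device is None:
            return ['CPUExecutionProvider']
        elif device == 'cuda' or device == torch.device("cuda"):
            return ['CUDAExecutionProvider', 'CPUExecutionProvider']
        return ['CPUExecutionProvider']

    # e.g. session = onnxruntime.InferenceSession("pretrain/vec-768-layer-12.onnx",
    #                                             providers=pick_providers(None))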

vencoder/CNHubertLarge.py

@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class CNHubertLarge(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/chinese-hubert-large-fairseq-ckpt.pt",device=None):
+    def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 1024
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -20,7 +22,7 @@ class CNHubertLarge(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -29,5 +31,5 @@ class CNHubertLarge(SpeechEncoder):
             "padding_mask": padding_mask.to(wav.device)
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs)
+            logits = self.model.extract_features(**inputs)
         return logits[0].transpose(1, 2)
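
For orientation, a hypothetical call into one of these fairseq-backed encoders (the checkpoint path comes from the diff; its availability, the 16 kHz mono input, and the vencoder/CNHubertLarge.py module path are assumptions):

    import torch

    from vencoder.CNHubertLarge import CNHubertLarge

    encoder = CNHubertLarge(device="cpu")  # loads pretrain/chinese-hubert-large-fairseq-ckpt.pt
    wav = torch.randn(16000)               # one second of mono audio, assumed 16 kHz
    units = encoder.encoder(wav)           # [1, hidden_dim=1024, n_frames] after the transpose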

vencoder/ContentVec256L12_Onnx.py

@@ -2,24 +2,28 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec256L12_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-256-layer-12.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-256-layer-12.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 256
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cuda' or device == torch.device("cuda"):
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
         else:
             providers = ['CPUExecutionProvider']
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
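
The same encode path driven directly through onnxruntime, for illustration (the [batch, channel, samples] input layout is read off the view/unsqueeze calls above; the model file itself is assumed to exist):

    import numpy as np
    import onnxruntime

    sess = onnxruntime.InferenceSession("pretrain/vec-256-layer-12.onnx",
                                        providers=['CPUExecutionProvider'])
    feats = np.random.randn(1, 1, 16000).astype(np.float32)  # [batch, channel, samples]
    logits = sess.run(None, {sess.get_inputs()[0].name: feats})
    # logits[0] is expected to be [1, n_frames, 256] before the final transpose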

vencoder/ContentVec256L9.py

@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class ContentVec256L9(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
             [vec_path],
@@ -20,7 +22,7 @@ class ContentVec256L9(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -30,6 +32,6 @@ class ContentVec256L9(SpeechEncoder):
             "output_layer": 9, # layer 9
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs)
-            feats = self.model.final_proj(logits[0])
+            logits = self.model.extract_features(**inputs)
+            feats = self.model.final_proj(logits[0])
         return feats.transpose(1, 2)

vencoder/ContentVec256L9_Onnx.py

@@ -3,7 +3,8 @@ import onnxruntime
 import torch
 
 class ContentVec256L9_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-256-layer-9.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-256-layer-9.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 256
         if device is None:
@@ -19,10 +20,11 @@ class ContentVec256L9_Onnx(SpeechEncoder):
+
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L12.py

@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from fairseq import checkpoint_utils
 
+
 class ContentVec768L12(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
@@ -20,7 +22,7 @@ class ContentVec768L12(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
@@ -30,5 +32,5 @@ class ContentVec768L12(SpeechEncoder):
             "output_layer": 12, # layer 12
         }
         with torch.no_grad():
-            logits = self.model.extract_features(**inputs)
-        return logits[0].transpose(1, 2)
\ No newline at end of file
+            logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)

vencoder/ContentVec768L12_Onnx.py

@@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec768L12_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/vec-768-layer-12.onnx",device=None):
+    def __init__(self, vec_path="pretrain/vec-768-layer-12.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cuda' or device == torch.device("cuda"):
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
         else:
             providers = ['CPUExecutionProvider']
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/ContentVec768L9_Onnx.py

@@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class ContentVec768L9_Onnx(SpeechEncoder):
     def __init__(self,vec_path = "pretrain/vec-768-layer-9.onnx",device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 768
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cuda' or device == torch.device("cuda"):
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
         else:
             providers = ['CPUExecutionProvider']
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/DPHubert.py

@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from vencoder.dphubert.model import wav2vec2_model
 
+
 class DPHubert(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/DPHuBERT-sp0.75.pth",device=None):
+    def __init__(self, vec_path="pretrain/DPHuBERT-sp0.75.pth", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -17,10 +19,10 @@ class DPHubert(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
-        feats = feats[None,:]
+        feats = feats[None, :]
         with torch.no_grad():
             with torch.inference_mode():
-                units = self.model(feats)[0]
-                return units.transpose(1,2)
\ No newline at end of file
+                units = self.model(feats)[0]
+                return units.transpose(1,2)
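
Note that the nested context managers kept by this diff are redundant: torch.inference_mode() already implies no-grad semantics (it is the stricter of the two), so the inner block alone would behave the same:

    with torch.inference_mode():  # implies no_grad; the outer wrapper is harmless but unnecessary
        units = self.model(feats)[0]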

vencoder/HubertSoft.py

@@ -1,8 +1,11 @@
 from vencoder.encoder import SpeechEncoder
 import torch
 from vencoder.hubert import hubert_model
+
+
 class HubertSoft(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/hubert-soft-0d54a1f4.pt",device=None):
+    def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         hubert_soft = hubert_model.hubert_soft(vec_path)
         if device is None:
@@ -15,10 +18,10 @@ class HubertSoft(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats[None,None,:]
         with torch.no_grad():
             with torch.inference_mode():
-                units = self.model.units(feats)
-                return units.transpose(1,2)
\ No newline at end of file
+                units = self.model.units(feats)
+                return units.transpose(1,2)
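
The feats[None,None,:] indexing, left untouched here, exists because hubert-soft's units() expects a (batch, channel, samples) tensor rather than the (1, samples) view used by the fairseq encoders:

    wav = torch.randn(16000)
    feats = wav[None, None, :]  # torch.Size([1, 1, 16000])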

vencoder/HubertSoft_Onnx.py

@@ -2,27 +2,31 @@ from vencoder.encoder import SpeechEncoder
 import onnxruntime
 import torch
 
+
 class HubertSoft_Onnx(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/hubert-soft.onnx",device=None):
+    def __init__(self, vec_path="pretrain/hubert-soft.onnx", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         self.hidden_dim = 256
         if device is None:
             self.dev = torch.device("cpu")
         else:
             self.dev = torch.device(device)
-        if device == 'cuda' or device == torch.device("cuda"):
+        if device == 'cpu' or device == torch.device("cpu") or device is None:
+            providers = ['CPUExecutionProvider']
+        elif device == 'cuda' or device == torch.device("cuda"):
             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
         else:
             providers = ['CPUExecutionProvider']
         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
 
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         feats = feats.unsqueeze(0).cpu().detach().numpy()
         onnx_input = {self.model.get_inputs()[0].name: feats}
         logits = self.model.run(None, onnx_input)
-        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
\ No newline at end of file
+        return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)

vencoder/WavLMBasePlus.py

@@ -2,8 +2,10 @@ from vencoder.encoder import SpeechEncoder
 import torch
 from vencoder.wavlm.WavLM import WavLM, WavLMConfig
 
+
 class WavLMBasePlus(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/WavLM-Base+.pt",device=None):
+    def __init__(self, vec_path="pretrain/WavLM-Base+.pt", device=None):
+        super().__init__()
         print("load model(s) from {}".format(vec_path))
         checkpoint = torch.load(vec_path)
         self.cfg = WavLMConfig(checkpoint['cfg'])
@@ -19,11 +21,11 @@ class WavLMBasePlus(SpeechEncoder):
     def encoder(self, wav):
         feats = wav
         if feats.dim() == 2: # double channels
-            feats = feats.mean(-1)
+            feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         if self.cfg.normalize:
-            feats = torch.nn.functional.layer_norm(feats , feats.shape)
+            feats = torch.nn.functional.layer_norm(feats, feats.shape)
         with torch.no_grad():
             with torch.inference_mode():
-                units = self.model.extract_features(feats[None,:])[0]
-                return units.transpose(1,2)
\ No newline at end of file
+                units = self.model.extract_features(feats[None, :])[0]
+                return units.transpose(1, 2)
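
The cfg.normalize branch (only its stray space before the comma changed) layer-normalizes the entire waveform, i.e. zero mean and unit variance over all samples, which WavLM checkpoints trained on normalized audio expect:

    feats = torch.randn(16000)
    normed = torch.nn.functional.layer_norm(feats, feats.shape)  # mean ~0, std ~1 over the full signal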

vencoder/WhisperPPG.py

@@ -6,7 +6,8 @@ from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
 
 class WhisperPPG(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/medium.pt",device=None):
+    def __init__(self, vec_path="pretrain/medium.pt", device=None):
+        super().__init__()
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             self.dev = torch.device(device)
@@ -26,5 +27,5 @@ class WhisperPPG(SpeechEncoder):
         mel = log_mel_spectrogram(audio).to(self.dev)
         with torch.no_grad():
             ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
-        return ppg[None,:,:].transpose(1, 2)
\ No newline at end of file
+            ppg = torch.FloatTensor(ppg[:ppgln, ]).to(self.dev)
+        return ppg[None, :, :].transpose(1, 2)
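
This hunk is the tail of a pipeline that pads the audio to Whisper's fixed 30-second window and then crops the encoder output back to the true frame count; a sketch of the whole path (the ppgln computation sits outside this hunk and is assumed to be samples // 320):

    import torch
    from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram

    def whisper_ppg(model, wav, dev):
        ppgln = wav.shape[0] // 320             # assumed true frame count, 320 samples per frame
        audio = pad_or_trim(wav)                # pad/trim to the 30 s window Whisper expects
        mel = log_mel_spectrogram(audio).to(dev)
        with torch.no_grad():
            ppg = model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
            ppg = torch.FloatTensor(ppg[:ppgln, ]).to(dev)
        return ppg[None, :, :].transpose(1, 2)  # [1, dim, n_frames]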

vencoder/WhisperPPGLarge.py

@@ -6,7 +6,8 @@ from vencoder.whisper.audio import pad_or_trim, log_mel_spectrogram
 
 class WhisperPPGLarge(SpeechEncoder):
-    def __init__(self,vec_path = "pretrain/large-v2.pt",device=None):
+    def __init__(self, vec_path="pretrain/large-v2.pt", device=None):
+        super().__init__()
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             self.dev = torch.device(device)
@@ -26,5 +27,5 @@ class WhisperPPGLarge(SpeechEncoder):
         mel = log_mel_spectrogram(audio).to(self.dev)
         with torch.no_grad():
             ppg = self.model.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-            ppg = torch.FloatTensor(ppg[:ppgln,]).to(self.dev)
-        return ppg[None,:,:].transpose(1, 2)
\ No newline at end of file
+            ppg = torch.FloatTensor(ppg[:ppgln, ]).to(self.dev)
+        return ppg[None, :, :].transpose(1, 2)

vencoder/encoder.py

@@ -1,12 +1,13 @@
 class SpeechEncoder(object):
-    def __init__(self,vec_path = "pretrain/checkpoint_best_legacy_500.pt",device=None):
-        self.model = None #This is Model
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+        self.model = None  # This is Model
         self.hidden_dim = 768
         pass
+
 
-    def encoder(self,wav):
-        '''
+    def encoder(self, wav):
+        """
         input: wav:[signal_length]
         output: embedding:[batchsize,hidden_dim,wav_frame]
-        '''
-        pass
\ No newline at end of file
+        """
+        pass
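
The base class above defines the contract that every encoder in this PR now honors via super().__init__(); a hypothetical subclass for illustration (the class name, checkpoint, and TorchScript loading are invented, not part of the repo):

    import torch

    from vencoder.encoder import SpeechEncoder

    class MyEncoder(SpeechEncoder):
        def __init__(self, vec_path="pretrain/my-model.pt", device=None):
            super().__init__()  # sets the self.model / self.hidden_dim defaults
            self.hidden_dim = 768
            self.dev = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
            self.model = torch.jit.load(vec_path).to(self.dev).eval()

        def encoder(self, wav):
            feats = wav
            if feats.dim() == 2:  # double channels -> mono
                feats = feats.mean(-1)
            with torch.inference_mode():
                units = self.model(feats[None, :])  # assumed [1, n_frames, hidden_dim]
            return units.transpose(1, 2)            # [1, hidden_dim, n_frames]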