Merge branch '4.1-Latest' into 4.1-Stable
This commit is contained in:
commit
ff19dec41e
|
@ -22,7 +22,8 @@
|
|||
"max_speclen": 512,
|
||||
"port": "8001",
|
||||
"keep_ckpts": 3,
|
||||
"all_in_mem": false
|
||||
"all_in_mem": false,
|
||||
"vol_aug":false
|
||||
},
|
||||
"data": {
|
||||
"training_files": "filelists/train.txt",
|
||||
|
@ -56,7 +57,8 @@
|
|||
"ssl_dim": 768,
|
||||
"n_speakers": 200,
|
||||
"speech_encoder":"vec768l12",
|
||||
"speaker_embedding":false
|
||||
"speaker_embedding":false,
|
||||
"vol_embedding":false
|
||||
},
|
||||
"spk": {
|
||||
"nyaru": 0,
|
||||
|
|
|
@ -7,7 +7,7 @@ import torch.utils.data
|
|||
|
||||
import modules.commons as commons
|
||||
import utils
|
||||
from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
|
||||
from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
|
||||
# import h5py
|
||||
|
@ -23,8 +23,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
3) computes spectrograms from audio files.
|
||||
"""
|
||||
|
||||
def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
|
||||
def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
|
||||
self.audiopaths = load_filepaths_and_text(audiopaths)
|
||||
self.hparams = hparams
|
||||
self.max_wav_value = hparams.data.max_wav_value
|
||||
self.sampling_rate = hparams.data.sampling_rate
|
||||
self.filter_length = hparams.data.filter_length
|
||||
|
@ -34,7 +35,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
self.use_sr = hparams.train.use_sr
|
||||
self.spec_len = hparams.train.max_speclen
|
||||
self.spk_map = hparams.spk
|
||||
|
||||
self.vol_emb = hparams.model.vol_embedding
|
||||
self.vol_aug = hparams.train.vol_aug and vol_aug
|
||||
random.seed(1234)
|
||||
random.shuffle(self.audiopaths)
|
||||
|
||||
|
@ -72,27 +74,48 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|||
|
||||
c = torch.load(filename+ ".soft.pt")
|
||||
c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
|
||||
|
||||
if self.vol_emb:
|
||||
volume_path = filename + ".vol.npy"
|
||||
volume = np.load(volume_path)
|
||||
volume = torch.from_numpy(volume).float()
|
||||
else:
|
||||
volume = None
|
||||
|
||||
lmin = min(c.size(-1), spec.size(-1))
|
||||
assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
|
||||
assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
|
||||
spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
|
||||
audio_norm = audio_norm[:, :lmin * self.hop_length]
|
||||
if volume!= None:
|
||||
volume = volume[:lmin]
|
||||
return c, f0, spec, audio_norm, spk, uv, volume
|
||||
|
||||
return c, f0, spec, audio_norm, spk, uv
|
||||
|
||||
def random_slice(self, c, f0, spec, audio_norm, spk, uv):
|
||||
def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
|
||||
# if spec.shape[1] < 30:
|
||||
# print("skip too short audio:", filename)
|
||||
# return None
|
||||
|
||||
if random.choice([True, False]) and self.vol_aug and volume!=None:
|
||||
max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
|
||||
max_shift = min(1, np.log10(1/max_amp))
|
||||
log10_vol_shift = random.uniform(-1, max_shift)
|
||||
audio_norm = audio_norm * (10 ** log10_vol_shift)
|
||||
volume = volume * (10 ** log10_vol_shift)
|
||||
spec = spectrogram_torch(audio_norm,
|
||||
self.hparams.data.filter_length,
|
||||
self.hparams.data.sampling_rate,
|
||||
self.hparams.data.hop_length,
|
||||
self.hparams.data.win_length,
|
||||
center=False)[0]
|
||||
|
||||
if spec.shape[1] > 800:
|
||||
start = random.randint(0, spec.shape[1]-800)
|
||||
end = start + 790
|
||||
spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
|
||||
audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
|
||||
|
||||
return c, f0, spec, audio_norm, spk, uv
|
||||
if volume !=None:
|
||||
volume = volume[start:end]
|
||||
return c, f0, spec, audio_norm, spk, uv,volume
|
||||
|
||||
def __getitem__(self, index):
|
||||
if self.all_in_mem:
|
||||
|
@ -124,12 +147,14 @@ class TextAudioCollate:
|
|||
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
spkids = torch.LongTensor(len(batch), 1)
|
||||
uv_padded = torch.FloatTensor(len(batch), max_c_len)
|
||||
volume_padded = torch.FloatTensor(len(batch), max_c_len)
|
||||
|
||||
c_padded.zero_()
|
||||
spec_padded.zero_()
|
||||
f0_padded.zero_()
|
||||
wav_padded.zero_()
|
||||
uv_padded.zero_()
|
||||
volume_padded.zero_()
|
||||
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
|
@ -151,5 +176,9 @@ class TextAudioCollate:
|
|||
|
||||
uv = row[5]
|
||||
uv_padded[i, :uv.size(0)] = uv
|
||||
|
||||
return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
|
||||
volume = row[6]
|
||||
if volume != None:
|
||||
volume_padded[i, :volume.size(0)] = volume
|
||||
else :
|
||||
volume_padded = None
|
||||
return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
|
||||
|
|
|
@ -137,6 +137,10 @@ class Svc(object):
|
|||
self.target_sample = self.hps_ms.data.sampling_rate
|
||||
self.hop_size = self.hps_ms.data.hop_length
|
||||
self.spk2id = self.hps_ms.spk
|
||||
try:
|
||||
self.vol_embedding = self.hps_ms.model.vol_embedding
|
||||
except Exception as e:
|
||||
self.vol_embedding = False
|
||||
try:
|
||||
self.speech_encoder = self.hps_ms.model.speech_encoder
|
||||
except Exception as e:
|
||||
|
@ -245,16 +249,17 @@ class Svc(object):
|
|||
c = c.half()
|
||||
with torch.no_grad():
|
||||
start = time.time()
|
||||
vol = None
|
||||
if not self.only_diffusion:
|
||||
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
|
||||
vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
|
||||
audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
|
||||
audio = audio[0,0].data.float()
|
||||
if self.shallow_diffusion:
|
||||
audio_mel = self.vocoder.extract(audio[None,:],self.target_sample)
|
||||
audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
|
||||
else:
|
||||
audio = torch.FloatTensor(wav).to(self.dev)
|
||||
audio_mel = None
|
||||
if self.only_diffusion or self.shallow_diffusion:
|
||||
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
|
||||
vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
|
||||
f0 = f0[:,:,None]
|
||||
c = c.transpose(-1,-2)
|
||||
audio_mel = self.diffusion_model(
|
||||
|
|
30
models.py
30
models.py
|
@ -16,7 +16,6 @@ from modules.commons import init_weights, get_padding
|
|||
from vdecoder.hifigan.models import Generator
|
||||
from utils import f0_to_coarse
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
|
||||
def __init__(self,
|
||||
channels,
|
||||
|
@ -253,7 +252,6 @@ class SpeakerEncoder(torch.nn.Module):
|
|||
|
||||
return embed
|
||||
|
||||
|
||||
class F0Decoder(nn.Module):
|
||||
def __init__(self,
|
||||
out_channels,
|
||||
|
@ -322,6 +320,7 @@ class SynthesizerTrn(nn.Module):
|
|||
ssl_dim,
|
||||
n_speakers,
|
||||
sampling_rate=44100,
|
||||
vol_embedding=False,
|
||||
**kwargs):
|
||||
|
||||
super().__init__()
|
||||
|
@ -342,7 +341,10 @@ class SynthesizerTrn(nn.Module):
|
|||
self.segment_size = segment_size
|
||||
self.gin_channels = gin_channels
|
||||
self.ssl_dim = ssl_dim
|
||||
self.vol_embedding = vol_embedding
|
||||
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
||||
if vol_embedding:
|
||||
self.emb_vol = nn.Linear(1, hidden_channels)
|
||||
|
||||
self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
|
||||
|
||||
|
@ -389,11 +391,15 @@ class SynthesizerTrn(nn.Module):
|
|||
self.speaker_map = self.speaker_map.unsqueeze(0).to(device)
|
||||
self.character_mix = True
|
||||
|
||||
def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
|
||||
g = self.emb_g(g).transpose(1, 2)
|
||||
def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None, vol = None):
|
||||
g = self.emb_g(g).transpose(1,2)
|
||||
|
||||
# vol proj
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
|
||||
|
||||
# ssl prenet
|
||||
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
|
||||
|
||||
# f0 predict
|
||||
lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
|
||||
|
@ -413,7 +419,7 @@ class SynthesizerTrn(nn.Module):
|
|||
|
||||
return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
|
||||
|
||||
def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False):
|
||||
def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None):
|
||||
|
||||
if c.device == torch.device("cuda"):
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
@ -433,15 +439,19 @@ class SynthesizerTrn(nn.Module):
|
|||
g = self.emb_g(g).transpose(1, 2)
|
||||
|
||||
x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
|
||||
|
||||
# vol proj
|
||||
vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
|
||||
|
||||
x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
|
||||
|
||||
if predict_f0:
|
||||
lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
|
||||
norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
|
||||
pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
|
||||
f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
|
||||
|
||||
|
||||
z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
|
||||
z = self.flow(z_p, c_mask, g=g, reverse=True)
|
||||
o = self.dec(z * c_mask, g=g, f0=f0)
|
||||
return o, f0
|
||||
return o,f0
|
||||
|
||||
|
|
|
@ -78,12 +78,14 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
|
|||
spec = torch.squeeze(spec, 0)
|
||||
torch.save(spec, spec_path)
|
||||
|
||||
if diff:
|
||||
if diff or hps.model.vol_embedding:
|
||||
volume_path = filename + ".vol.npy"
|
||||
volume_extractor = utils.Volume_Extractor(hop_length)
|
||||
if not os.path.exists(volume_path):
|
||||
volume = volume_extractor.extract(audio_norm)
|
||||
np.save(volume_path, volume.to('cpu').numpy())
|
||||
|
||||
if diff:
|
||||
mel_path = filename + ".mel.npy"
|
||||
if not os.path.exists(mel_path) and mel_extractor is not None:
|
||||
mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
|
||||
|
|
11
train.py
11
train.py
|
@ -75,7 +75,7 @@ def run(rank, n_gpus, hps):
|
|||
train_loader = DataLoader(train_dataset, num_workers=num_workers, shuffle=False, pin_memory=True,
|
||||
batch_size=hps.train.batch_size, collate_fn=collate_fn)
|
||||
if rank == 0:
|
||||
eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem)
|
||||
eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem,vol_aug = False)
|
||||
eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
|
||||
batch_size=1, pin_memory=False,
|
||||
drop_last=False, collate_fn=collate_fn)
|
||||
|
@ -155,7 +155,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
|||
net_g.train()
|
||||
net_d.train()
|
||||
for batch_idx, items in enumerate(train_loader):
|
||||
c, f0, spec, y, spk, lengths, uv = items
|
||||
c, f0, spec, y, spk, lengths, uv,volume = items
|
||||
g = spk.cuda(rank, non_blocking=True)
|
||||
spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
|
||||
c = c.cuda(rank, non_blocking=True)
|
||||
|
@ -173,7 +173,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
|
|||
with autocast(enabled=hps.train.fp16_run):
|
||||
y_hat, ids_slice, z_mask, \
|
||||
(z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
|
||||
spec_lengths=lengths)
|
||||
spec_lengths=lengths,vol = volume)
|
||||
|
||||
y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
|
||||
y_hat_mel = mel_spectrogram_torch(
|
||||
|
@ -281,12 +281,13 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|||
audio_dict = {}
|
||||
with torch.no_grad():
|
||||
for batch_idx, items in enumerate(eval_loader):
|
||||
c, f0, spec, y, spk, _, uv = items
|
||||
c, f0, spec, y, spk, _, uv,volume = items
|
||||
g = spk[:1].cuda(0)
|
||||
spec, y = spec[:1].cuda(0), y[:1].cuda(0)
|
||||
c = c[:1].cuda(0)
|
||||
f0 = f0[:1].cuda(0)
|
||||
uv= uv[:1].cuda(0)
|
||||
volume = volume[:1].cuda(0)
|
||||
mel = spec_to_mel_torch(
|
||||
spec,
|
||||
hps.data.filter_length,
|
||||
|
@ -294,7 +295,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
|
|||
hps.data.sampling_rate,
|
||||
hps.data.mel_fmin,
|
||||
hps.data.mel_fmax)
|
||||
y_hat,_ = generator.module.infer(c, f0, uv, g=g)
|
||||
y_hat,_ = generator.module.infer(c, f0, uv, g=g,vol = volume)
|
||||
|
||||
y_hat_mel = mel_spectrogram_torch(
|
||||
y_hat.squeeze(1).float(),
|
||||
|
|
Loading…
Reference in New Issue