From 358369d0327e3bcfa4814e4cc6d27ddc9af00620 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Sun, 28 May 2023 21:47:32 +0800
Subject: [PATCH 1/9] Update vol emb

---
 configs_template/config_template.json |  3 ++-
 data_utils.py                         | 32 ++++++++++++++++++-------
 models.py                             | 34 +++++++++++++++++----------
 preprocess_hubert_f0.py               |  4 +++-
 train.py                              |  4 ++--
 5 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/configs_template/config_template.json b/configs_template/config_template.json
index bbe0a30..86eafd7 100644
--- a/configs_template/config_template.json
+++ b/configs_template/config_template.json
@@ -56,7 +56,8 @@
     "ssl_dim": 768,
     "n_speakers": 200,
     "speech_encoder":"vec768l12",
-    "speaker_embedding":false
+    "speaker_embedding":false,
+    "vol_embedding":false
   },
   "spk": {
     "nyaru": 0,
diff --git a/data_utils.py b/data_utils.py
index d428006..0893e6e 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -23,7 +23,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         3) computes spectrograms from audio files.
     """
 
-    def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
+    def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = False):
         self.audiopaths = load_filepaths_and_text(audiopaths)
         self.max_wav_value = hparams.data.max_wav_value
         self.sampling_rate = hparams.data.sampling_rate
@@ -34,6 +34,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.use_sr = hparams.train.use_sr
         self.spec_len = hparams.train.max_speclen
         self.spk_map = hparams.spk
+        self.vol_emb = hparams.model.vol_embedding
 
         random.seed(1234)
         random.shuffle(self.audiopaths)
@@ -72,17 +73,23 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         c = torch.load(filename+ ".soft.pt")
         c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
-
+        if self.vol_emb:
+            volume_path = filename + ".vol.npy"
+            volume = np.load(volume_path)
+            volume = torch.from_numpy(volume).float()
+        else:
+            volume = None
         lmin = min(c.size(-1), spec.size(-1))
         assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
         assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
         spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
         audio_norm = audio_norm[:, :lmin * self.hop_length]
+        if volume is not None:
+            volume = volume[:lmin]
+        return c, f0, spec, audio_norm, spk, uv, volume
 
-        return c, f0, spec, audio_norm, spk, uv
-
-    def random_slice(self, c, f0, spec, audio_norm, spk, uv):
+    def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
         # if spec.shape[1] < 30:
         #     print("skip too short audio:", filename)
         #     return None
@@ -91,8 +98,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
             end = start + 790
             spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
             audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
-
-        return c, f0, spec, audio_norm, spk, uv
+        if volume is not None:
+            volume = volume[start:end]
+        return c, f0, spec, audio_norm, spk, uv, volume
 
     def __getitem__(self, index):
         if self.all_in_mem:
@@ -124,12 +132,14 @@ class TextAudioCollate:
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
         spkids = torch.LongTensor(len(batch), 1)
         uv_padded = torch.FloatTensor(len(batch), max_c_len)
+        volume_padded = torch.FloatTensor(len(batch), max_c_len)
 
         c_padded.zero_()
         spec_padded.zero_()
         f0_padded.zero_()
         wav_padded.zero_()
         uv_padded.zero_()
+        volume_padded.zero_()
 
         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -151,5 +161,9 @@ class TextAudioCollate:
             uv = row[5]
             uv_padded[i, :uv.size(0)] = uv
-
-        return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
+            volume = row[6]
+            if volume is not None:
+                volume_padded[i, :volume.size(0)] = volume
+            else:
+                volume_padded = None
+        return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
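
The loader change above assumes each training utterance already has a `<filename>.vol.npy` beside it, one scalar per spectrogram frame — which is why `volume` is truncated to `lmin` together with `spec` and `f0`. The series never shows `utils.Volume_Extractor` itself; a minimal sketch of what such a frame-level curve could be, assuming plain RMS energy per hop-sized window (the class name and details here are illustrative, not the repo's actual implementation):

```python
import torch

class VolumeExtractorSketch:
    """Hypothetical stand-in for utils.Volume_Extractor: frame-level RMS energy."""
    def __init__(self, hop_length: int = 512):
        self.hop_length = hop_length

    def extract(self, audio: torch.Tensor) -> torch.Tensor:
        # audio: [1, samples] normalized waveform -> [frames] volume curve
        n_frames = audio.shape[-1] // self.hop_length
        frames = audio[..., :n_frames * self.hop_length].reshape(1, n_frames, self.hop_length)
        return frames.pow(2).mean(dim=-1).sqrt().squeeze(0)

volume = VolumeExtractorSketch().extract(torch.randn(1, 44100) * 0.1)
assert volume.ndim == 1 and volume.shape[0] == 44100 // 512
```
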
diff --git a/models.py b/models.py
index fcfc2de..1c7f876 100644
--- a/models.py
+++ b/models.py
@@ -16,7 +16,6 @@ from modules.commons import init_weights, get_padding
 from vdecoder.hifigan.models import Generator
 from utils import f0_to_coarse
 
-
 class ResidualCouplingBlock(nn.Module):
     def __init__(self,
             channels,
@@ -253,7 +252,6 @@ class SpeakerEncoder(torch.nn.Module):
 
         return embed
 
-
 class F0Decoder(nn.Module):
     def __init__(self,
             out_channels,
@@ -322,6 +320,7 @@ class SynthesizerTrn(nn.Module):
                  ssl_dim,
                  n_speakers,
                  sampling_rate=44100,
+                 vol_embedding=False,
                  **kwargs):
 
         super().__init__()
@@ -342,7 +341,10 @@ class SynthesizerTrn(nn.Module):
         self.segment_size = segment_size
         self.gin_channels = gin_channels
         self.ssl_dim = ssl_dim
+        self.vol_embedding = vol_embedding
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        if vol_embedding:
+            self.emb_vol = nn.Linear(1, hidden_channels)
 
         self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
 
@@ -389,11 +391,15 @@ class SynthesizerTrn(nn.Module):
             self.speaker_map = self.speaker_map.unsqueeze(0)
             self.character_mix = True
 
-    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
+    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None, vol=None):
         g = self.emb_g(g).transpose(1, 2)
+
+        # vol proj
+        vol = self.emb_vol(vol[:, :, None]).transpose(1, 2) if vol is not None and self.vol_embedding else 0
+
         # ssl prenet
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
 
         # f0 predict
         lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
@@ -412,20 +418,24 @@ class SynthesizerTrn(nn.Module):
         o = self.dec(z_slice, g=g, f0=pitch_slice)
 
         return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
 
-    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
+    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False, vol=None):
         c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
         g = self.emb_g(g).transpose(1, 2)
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)
+        # vol proj
+        vol = self.emb_vol(vol[:, :, None]).transpose(1, 2) if vol is not None and self.vol_embedding else 0
+
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
 
         if predict_f0:
             lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
             norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
             pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
             f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
 
         z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
         z = self.flow(z_p, c_mask, g=g, reverse=True)
         o = self.dec(z * c_mask, g=g, f0=f0)
         return o, f0
+
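
The new `emb_vol` is a 1-to-hidden linear layer, so the per-frame volume enters the prenet as one more additive conditioning term, alongside the content features and the voiced/unvoiced embedding. A quick shape check of the `vol[:, :, None]` projection (the hidden size is an assumption here; 192 is the usual template value):

```python
import torch
import torch.nn as nn

hidden_channels = 192                    # assumed; check your config_template
emb_vol = nn.Linear(1, hidden_channels)

vol = torch.rand(2, 400)                 # [batch, frames] volume curve
v = emb_vol(vol[:, :, None]).transpose(1, 2)
assert v.shape == (2, hidden_channels, 400)
# v broadcasts onto x = self.pre(c) * x_mask + self.emb_uv(...).transpose(1, 2),
# which is also [batch, hidden_channels, frames]; with vol=None the term is the
# scalar 0 and the addition is a no-op.
```
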
diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py
index 9717f57..5924a08 100644
--- a/preprocess_hubert_f0.py
+++ b/preprocess_hubert_f0.py
@@ -78,12 +78,14 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
         spec = torch.squeeze(spec, 0)
         torch.save(spec, spec_path)
 
-    if diff:
+    if diff or hps.model.vol_embedding:
         volume_path = filename + ".vol.npy"
         volume_extractor = utils.Volume_Extractor(hop_length)
         if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to('cpu').numpy())
+
+    if diff:
         mel_path = filename + ".mel.npy"
         if not os.path.exists(mel_path) and mel_extractor is not None:
             mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
diff --git a/train.py b/train.py
index 6ff7f8e..e242431 100644
--- a/train.py
+++ b/train.py
@@ -155,7 +155,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
     net_g.train()
     net_d.train()
     for batch_idx, items in enumerate(train_loader):
-        c, f0, spec, y, spk, lengths, uv = items
+        c, f0, spec, y, spk, lengths, uv, volume = items
         g = spk.cuda(rank, non_blocking=True)
         spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True)
         c = c.cuda(rank, non_blocking=True)
@@ -173,7 +173,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
         with autocast(enabled=hps.train.fp16_run):
             y_hat, ids_slice, z_mask, \
             (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
-                                                                                spec_lengths=lengths)
+                                                                                spec_lengths=lengths, vol=volume.cuda(rank, non_blocking=True) if volume is not None else None)
 
             y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
             y_hat_mel = mel_spectrogram_torch(

From 649ecd4c7eae440eecdbbc50923cfd7ce5b28634 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Sun, 28 May 2023 22:29:27 +0800
Subject: [PATCH 2/9] Update vol infer

---
 inference/infer_tool.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 9d0e016..632f3d2 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -136,6 +136,7 @@ class Svc(object):
         self.target_sample = self.hps_ms.data.sampling_rate
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
+        self.vol_embedding = self.hps_ms.model.vol_embedding
         try:
             self.speech_encoder = self.hps_ms.model.speech_encoder
         except Exception as e:
@@ -233,16 +234,17 @@ class Svc(object):
             c = c.half()
         with torch.no_grad():
             start = time.time()
+            vol = None
             if not self.only_diffusion:
-                audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
+                vol = self.volume_extractor.extract(audio[None,:])[None,:].to(self.dev) if self.vol_embedding else None
+                audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale, vol=vol)
                 audio = audio[0,0].data.float()
-                if self.shallow_diffusion:
-                    audio_mel = self.vocoder.extract(audio[None,:],self.target_sample)
+                audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
             else:
                 audio = torch.FloatTensor(wav).to(self.dev)
                 audio_mel = None
             if self.only_diffusion or self.shallow_diffusion:
-                vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
+                vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
                 f0 = f0[:,:,None]
                 c = c.transpose(-1,-2)
                 audio_mel = self.diffusion_model(
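
PATCH 2 threads one volume curve to two consumers with different layouts: `SynthesizerTrn.infer` takes `[batch, frames]`, while the diffusion model wants a trailing channel axis, and the `if vol is None else vol[:,:,None]` branch reuses the already-extracted curve rather than re-extracting it. A small sketch of the reshape (sizes are illustrative):

```python
import torch

vol = torch.rand(1, 400)             # volume_extractor.extract(...) -> [1, frames]
vol_for_infer = vol                  # net_g_ms.infer(..., vol=vol)
vol_for_diffusion = vol[:, :, None]  # diffusion_model expects [1, frames, 1]
assert vol_for_diffusion.shape == (1, 400, 1)
```
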
From fd64b8b30f7f78609e504d578f8d46c1a88b0180 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Sun, 28 May 2023 22:55:02 +0800
Subject: [PATCH 3/9] Update vol default

---
 inference/infer_tool.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index 632f3d2..ee6b4d7 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -136,7 +136,10 @@ class Svc(object):
         self.target_sample = self.hps_ms.data.sampling_rate
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
-        self.vol_embedding = self.hps_ms.model.vol_embedding
+        try:
+            self.vol_embedding = self.hps_ms.model.vol_embedding
+        except Exception as e:
+            self.vol_embedding = False
         try:
             self.speech_encoder = self.hps_ms.model.speech_encoder
         except Exception as e:

From 32337a0c5b97d5cc3fbe67c8d7e39b8c86f9fdaa Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 01:22:09 +0800
Subject: [PATCH 4/9] Update vol augment

---
 configs_template/config_template.json |  3 ++-
 data_utils.py                         | 21 +++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/configs_template/config_template.json b/configs_template/config_template.json
index 86eafd7..b392fd1 100644
--- a/configs_template/config_template.json
+++ b/configs_template/config_template.json
@@ -22,7 +22,8 @@
     "max_speclen": 512,
     "port": "8001",
     "keep_ckpts": 3,
-    "all_in_mem": false
+    "all_in_mem": false,
+    "vol_aug":false
   },
   "data": {
     "training_files": "filelists/train.txt",
diff --git a/data_utils.py b/data_utils.py
index 0893e6e..54cf997 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -7,7 +7,7 @@ import torch.utils.data
 
 import modules.commons as commons
 import utils
-from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
+from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 
 # import h5py
@@ -25,6 +25,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 
     def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = False):
         self.audiopaths = load_filepaths_and_text(audiopaths)
+        self.hparams = hparams
         self.max_wav_value = hparams.data.max_wav_value
         self.sampling_rate = hparams.data.sampling_rate
         self.filter_length = hparams.data.filter_length
@@ -35,7 +36,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.use_sr = hparams.train.use_sr
         self.spec_len = hparams.train.max_speclen
         self.spk_map = hparams.spk
         self.vol_emb = hparams.model.vol_embedding
-
+        self.vol_aug = vol_aug
         random.seed(1234)
         random.shuffle(self.audiopaths)
@@ -93,6 +94,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         # if spec.shape[1] < 30:
         #     print("skip too short audio:", filename)
         #     return None
+
+        if random.choice([True, False]) and self.vol_aug and volume is not None:
+            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
+            max_shift = min(1, np.log10(1/max_amp))
+            log10_vol_shift = random.uniform(-1, max_shift)
+            audio_norm = audio_norm * (10 ** log10_vol_shift)
+            volume = volume * (10 ** log10_vol_shift)
+            spec = mel_spectrogram_torch(audio_norm,
+                                         self.hparams.data.filter_length,
+                                         self.hparams.data.n_mel_channels,
+                                         self.hparams.data.sampling_rate,
+                                         self.hparams.data.hop_length,
+                                         self.hparams.data.win_length,
+                                         self.hparams.data.mel_fmin,
+                                         self.hparams.data.mel_fmax)
+
         if spec.shape[1] > 800:
             start = random.randint(0, spec.shape[1]-800)
             end = start + 790
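
The augmentation draws a gain uniformly in the log10 domain: at most a 10x attenuation (shift of -1), and a boost capped by `max_shift = min(1, log10(1/max_amp))` so the scaled peak can never exceed 1.0, i.e. no clipping. The volume curve is scaled by the same factor so it stays consistent with the audio. A check of that bound:

```python
import math
import random

random.seed(0)
max_amp = 0.6 + 1e-5                         # peak of |audio_norm|, as in the patch
max_shift = min(1, math.log10(1 / max_amp))  # largest safe log10 gain
for _ in range(1000):
    log10_vol_shift = random.uniform(-1, max_shift)
    assert max_amp * 10 ** log10_vol_shift <= 1.0 + 1e-9
```
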
From e3dc2b2b94302824c163ffb899d1c65f3d0024ef Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 01:22:57 +0800
Subject: [PATCH 5/9] fix(vol_aug)

---
 data_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 54cf997..52ad0a6 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -23,7 +23,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         3) computes spectrograms from audio files.
     """
 
-    def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = False):
+    def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
         self.audiopaths = load_filepaths_and_text(audiopaths)
         self.hparams = hparams
         self.max_wav_value = hparams.data.max_wav_value
@@ -36,7 +36,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.spec_len = hparams.train.max_speclen
         self.spk_map = hparams.spk
         self.vol_emb = hparams.model.vol_embedding
-        self.vol_aug = vol_aug
+        self.vol_aug = hparams.train.vol_aug
         random.seed(1234)
         random.shuffle(self.audiopaths)

From 966cb48134902575a3ce2251118a181079434137 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 01:46:18 +0800
Subject: [PATCH 6/9] Debug: use the linear spectrogram (not mel) for vol aug

---
 data_utils.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 52ad0a6..18dad0e 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -7,7 +7,7 @@ import torch.utils.data
 
 import modules.commons as commons
 import utils
-from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, mel_spectrogram_torch
+from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 
 # import h5py
@@ -101,14 +101,12 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
             log10_vol_shift = random.uniform(-1, max_shift)
             audio_norm = audio_norm * (10 ** log10_vol_shift)
             volume = volume * (10 ** log10_vol_shift)
-            spec = mel_spectrogram_torch(audio_norm,
-                                         self.hparams.data.filter_length,
-                                         self.hparams.data.n_mel_channels,
-                                         self.hparams.data.sampling_rate,
-                                         self.hparams.data.hop_length,
-                                         self.hparams.data.win_length,
-                                         self.hparams.data.mel_fmin,
-                                         self.hparams.data.mel_fmax)
+            spec = spectrogram_torch(audio_norm,
+                                     self.hparams.data.filter_length,
+                                     self.hparams.data.sampling_rate,
+                                     self.hparams.data.hop_length,
+                                     self.hparams.data.win_length,
+                                     center=False)[0]
 
         if spec.shape[1] > 800:
             start = random.randint(0, spec.shape[1]-800)

From 4e982eb7d5d676f3dcba29bdca07177d67bc9eaa Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 01:56:34 +0800
Subject: [PATCH 7/9] Debug: pass volume to the generator in evaluate()

---
 train.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index e242431..a6e219f 100644
--- a/train.py
+++ b/train.py
@@ -281,12 +281,13 @@ def evaluate(hps, generator, eval_loader, writer_eval):
     audio_dict = {}
     with torch.no_grad():
         for batch_idx, items in enumerate(eval_loader):
-            c, f0, spec, y, spk, _, uv = items
+            c, f0, spec, y, spk, _, uv, volume = items
             g = spk[:1].cuda(0)
             spec, y = spec[:1].cuda(0), y[:1].cuda(0)
             c = c[:1].cuda(0)
             f0 = f0[:1].cuda(0)
             uv= uv[:1].cuda(0)
+            volume = volume[:1].cuda(0) if volume is not None else None
             mel = spec_to_mel_torch(
                 spec,
                 hps.data.filter_length,
@@ -294,7 +295,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
                 hps.data.sampling_rate,
                 hps.data.mel_fmin,
                 hps.data.mel_fmax)
-            y_hat,_ = generator.module.infer(c, f0, uv, g=g)
+            y_hat, _ = generator.module.infer(c, f0, uv, g=g, vol=volume)
 
             y_hat_mel = mel_spectrogram_torch(
                 y_hat.squeeze(1).float(),
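
PATCH 6 matters because the augmented `spec` must match the linear-magnitude spectrograms the rest of the pipeline computes with `spectrogram_torch`; PATCH 4 had accidentally swapped in a mel spectrogram with a different shape and scale. Recomputing after the gain is also straightforwardly correct, since a linear magnitude spectrogram scales with the waveform, |STFT(g*x)| = g*|STFT(x)|. A quick numeric check (sizes are arbitrary):

```python
import torch

x = torch.randn(1, 8192)
gain = 10 ** -0.5
window = torch.hann_window(1024)
mag = lambda a: torch.stft(a, n_fft=1024, hop_length=256, window=window,
                           return_complex=True).abs()
assert torch.allclose(mag(gain * x), gain * mag(x), atol=1e-4)
```
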
From 5f255340815bb31691d168f6eeba060893c843f7 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 02:11:04 +0800
Subject: [PATCH 8/9] Debug: extract inference volume from the source wav

---
 inference/infer_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index ee6b4d7..ea7e66e 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -239,7 +239,7 @@ class Svc(object):
             start = time.time()
             vol = None
             if not self.only_diffusion:
-                vol = self.volume_extractor.extract(audio[None,:])[None,:].to(self.dev) if self.vol_embedding else None
+                vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
                 audio, f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale, vol=vol)
                 audio = audio[0,0].data.float()
                 audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None

From dfc2b6a6287c445a09d5efadfb24e6a406deb194 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 29 May 2023 02:23:11 +0800
Subject: [PATCH 9/9] Debug: disable vol aug for the validation loader

---
 data_utils.py | 4 ++--
 train.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 18dad0e..07eccbd 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -23,7 +23,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         3) computes spectrograms from audio files.
     """
 
-    def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
+    def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
         self.audiopaths = load_filepaths_and_text(audiopaths)
         self.hparams = hparams
         self.max_wav_value = hparams.data.max_wav_value
@@ -36,7 +36,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.spec_len = hparams.train.max_speclen
         self.spk_map = hparams.spk
         self.vol_emb = hparams.model.vol_embedding
-        self.vol_aug = hparams.train.vol_aug
+        self.vol_aug = hparams.train.vol_aug and vol_aug
         random.seed(1234)
         random.shuffle(self.audiopaths)
diff --git a/train.py b/train.py
index a6e219f..123f48a 100644
--- a/train.py
+++ b/train.py
@@ -75,7 +75,7 @@ def run(rank, n_gpus, hps):
     train_loader = DataLoader(train_dataset, num_workers=num_workers, shuffle=False, pin_memory=True,
                               batch_size=hps.train.batch_size, collate_fn=collate_fn)
     if rank == 0:
-        eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem)
+        eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps, all_in_mem=all_in_mem, vol_aug=False)
         eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
                                  batch_size=1, pin_memory=False,
                                  drop_last=False, collate_fn=collate_fn)
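
PATCH 3's try/except is the series' pattern for configs that predate a key; the default must stay `False` so checkpoints trained before this series keep loading. An equivalent one-liner, assuming the hparams object raises AttributeError for missing keys (true for e.g. SimpleNamespace; worth verifying for the repo's HParams class):

```python
from types import SimpleNamespace

model_hps = SimpleNamespace(ssl_dim=768)  # stand-in for self.hps_ms.model
vol_embedding = getattr(model_hps, "vol_embedding", False)
assert vol_embedding is False
```
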