fix f0

2023-08-23 03:05:25 +08:00 · 2023-08-23 03:05:25 +08:00 · 08617333ce
parent fd8e717112
commit 08617333ce
2 changed files with 4 additions and 2 deletions
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -6,6 +6,7 @@ import logging
 import os
 import pickle
 import time
+import typing
 from pathlib import Path

 import librosa
@@ -199,6 +200,7 @@ class Svc(object):
            _ = self.net_g_ms.half().eval().to(self.dev)
        else:
            _ = self.net_g_ms.eval().to(self.dev)
+        del self.net_g_ms.enc_q
        if spk_mix_enable:
            self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)

@@ -268,7 +270,7 @@ class Svc(object):
              second_encoding = False,
              loudness_envelope_adjustment = 1
              ):
-        if isinstance(raw_path, str):
+        if isinstance(raw_path, str) or isinstance(raw_path, io.BytesIO):
            wav, sr = torchaudio.load(raw_path)
            if not hasattr(self,"audio_resample_transform") or self.audio16k_resample_transform.orig_freq != sr:
                self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
--- a/models.py
+++ b/models.py
@@ -523,7 +523,7 @@ class SynthesizerTrn(nn.Module):
        if self.use_automatic_f0_prediction and predict_f0:
            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
-            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g).to(f0)
            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
        
        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)