From 5b249743cda709f7a514ddce0fb5fcb734624ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=CE=9D=CE=B1=CF=81=CE=BF=CF=85=CF=83=CE=AD=C2=B7=CE=BC?= =?UTF-8?q?=C2=B7=CE=B3=CE=B9=CE=BF=CF=85=CE=BC=CE=B5=CE=BC=CE=AF=C2=B7?= =?UTF-8?q?=CE=A7=CE=B9=CE=BD=CE=B1=CE=BA=CE=AC=CE=BD=CE=BD=CE=B1?= <40709280+NaruseMioShirakana@users.noreply.github.com> Date: Fri, 10 Mar 2023 19:41:05 +0800 Subject: [PATCH] Add files via upload --- configs/config.json | 97 +--- inference/infer_tool.py | 7 +- modules/modules.py | 535 ++++++++----------- onnxexport/model_onnx.py | 1096 +++++++------------------------------- 4 files changed, 437 insertions(+), 1298 deletions(-) diff --git a/configs/config.json index 550059a..f19d46d 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1,103 +1,62 @@ { "train": { - "log_interval": 50, - "eval_interval": 1000, + "log_interval": 200, + "eval_interval": 800, "seed": 1234, - "port": 8001, "epochs": 10000, - "learning_rate": 0.0002, + "learning_rate": 0.0001, "betas": [ 0.8, 0.99 ], "eps": 1e-09, "batch_size": 6, - "accumulation_steps": 1, "fp16_run": false, - "lr_decay": 0.998, + "lr_decay": 0.999875, "segment_size": 10240, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, - "keep_ckpts":4 + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 512, + "port": "8001", + "keep_ckpts": 3 }, "data": { - "data_dir": "dataset", - "dataset_type": "SingDataset", - "collate_type": "SingCollate", - "training_filelist": "filelists/train.txt", - "validation_filelist": "filelists/val.txt", + "training_files": "filelists/train.txt", + "validation_files": "filelists/val.txt", "max_wav_value": 32768.0, "sampling_rate": 44100, - "n_fft": 2048, - "fmin": 0, - "fmax": 22050, + "filter_length": 2048, "hop_length": 512, - "win_size": 2048, - "acoustic_dim": 80, - "c_dim": 256, - "min_level_db": -115, - "ref_level_db": 20, - "min_db": -115, - "max_abs_value": 4.0, - "n_speakers": 200 + "win_length": 2048, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 22050 }, "model": { + "inter_channels": 192, "hidden_channels": 192, - "spk_channels": 192, "filter_channels": 768, "n_heads": 2, - "n_layers": 4, + "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, - "prior_hidden_channels": 192, - "prior_filter_channels": 768, - "prior_n_heads": 2, - "prior_n_layers": 4, - "prior_kernel_size": 3, - "prior_p_dropout": 0.1, "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [ 8, 8, 2, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16, 4, 4, 4], + "n_layers_q": 3, "use_spectral_norm": false, - "resblock_kernel_sizes": [ - 3, - 7, - 11 - ], - "resblock_dilation_sizes": [ - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ] - ], - "upsample_rates": [ - 8, - 8, - 4, - 2 - ], - "upsample_initial_channel": 256, - "upsample_kernel_sizes": [ - 16, - 16, - 8, - 4 - ], - "n_harmonic": 64, - "n_bands": 65 + "gin_channels": 256, + "ssl_dim": 256, + "n_speakers": 200 }, "spk": { - "jishuang": 0, + "nyaru": 0, "huiyu": 1, "nen": 2, "paimon": 3, diff --git a/inference/infer_tool.py index 03b97ad..415e956 100644 --- a/inference/infer_tool.py +++ b/inference/infer_tool.py @@ -127,8 +127,9 @@ class Svc(object): def load_model(self): # load the model configuration self.net_g_ms = SynthesizerTrn( - self.hps_ms - ) + self.hps_ms.data.filter_length // 2 + 1, + self.hps_ms.train.segment_size // self.hps_ms.data.hop_length, + **self.hps_ms.model) _ =
utils.load_checkpoint(self.net_g_path, self.net_g_ms, None) if "half" in self.net_g_path and torch.cuda.is_available(): _ = self.net_g_ms.half().eval().to(self.dev) @@ -173,7 +174,7 @@ class Svc(object): c = c.half() with torch.no_grad(): start = time.time() - audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0][0,0].data.float() + audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float() use_time = time.time() - start print("vits use time:{}".format(use_time)) return audio, audio.shape[-1] diff --git a/modules/modules.py b/modules/modules.py index 6dcba59..54290fd 100644 --- a/modules/modules.py +++ b/modules/modules.py @@ -5,187 +5,182 @@ import scipy import torch from torch import nn from torch.nn import functional as F -from torch.autograd import Function -from typing import Any, Optional, Tuple from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm import modules.commons as commons from modules.commons import init_weights, get_padding -from modules.transforms import piecewise_rational_quadratic_transform + LRELU_SLOPE = 0.1 class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + class ConvReluNorm(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
- self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential( - nn.ReLU(), - nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ + """ + Dilated and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size ** i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, - groups=channels, dilation=dilation, padding=padding - )) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y =
self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=0, spk_channels=0, - p_dropout=0): - super(WN, self).__init__() - assert (kernel_size % 2 == 1) - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_speakers = n_speakers - self.spk_channels = spk_channels - self.p_dropout = p_dropout + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) - if n_speakers > 0: - cond_layer = torch.nn.Conv1d(spk_channels, 2 * hidden_channels * n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') - for i in range(n_layers): - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') - self.in_layers.append(in_layer) + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) - if g is not None: - g = self.cond_layer(g) + if g is not None: + g = self.cond_layer(g) - for i in range(self.n_layers): - x_in 
= self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) - acts = commons.fused_add_tanh_sigmoid_multiply( - x_in, - g_l, - n_channels_tensor) - acts = self.drop(acts) + acts = commons.fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, :self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels:, :] - else: - output = output + res_skip_acts - return output * x_mask + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask - def remove_weight_norm(self): - if self.n_speakers > 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) class ResBlock1(torch.nn.Module): @@ -261,193 +256,87 @@ class ResBlock2(torch.nn.Module): class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = 
torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x class ResidualCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - n_speakers=0, - spk_channels=0, - mean_only=False): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, n_speakers=n_speakers, - spk_channels=spk_channels) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - -class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - n_speakers=0, - gin_channels=0): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, - n_speakers=n_speakers, spk_channels=gin_channels, mean_only=True)) - self.flows.append(Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for 
flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - -class ConvFlow(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) - self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - - unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_derivatives = h[..., 2 * self.num_bins:] - - x1, logabsdet = piecewise_rational_quadratic_transform(x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails='linear', - tail_bound=self.tail_bound - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1, 2]) - if not reverse: - return x, logdet - else: - return x - - -class ResStack(nn.Module): - def __init__(self, channel, kernel_size=3, base=3, nums=4): - super(ResStack, self).__init__() - - self.layers = nn.ModuleList([ - nn.Sequential( - nn.LeakyReLU(), - nn.utils.weight_norm(nn.Conv1d(channel, channel, - kernel_size=kernel_size, dilation=base ** i, padding=base ** i)), - nn.LeakyReLU(), - nn.utils.weight_norm(nn.Conv1d(channel, channel, - kernel_size=kernel_size, dilation=1, padding=1)), - ) - for i in range(nums) - ]) - - def forward(self, x): - for layer in self.layers: - x = x + layer(x) - return x + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x diff --git a/onnxexport/model_onnx.py b/onnxexport/model_onnx.py index b50406c..e28bae9 100644 --- a/onnxexport/model_onnx.py +++ b/onnxexport/model_onnx.py @@ -1,379 +1,64 @@ -import sys -import copy -import math import torch from torch import nn from torch.nn import functional as F -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -import numpy as np -sys.path.append('../..') +import modules.attentions as attentions import modules.commons as commons import modules.modules as modules -import modules.attentions as attentions -from modules.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from modules.ddsp import mlp, gru, scale_function, remove_above_nyquist, upsample -from modules.ddsp import harmonic_synth, amp_to_impulse_response, fft_convolve -from modules.ddsp import resample import utils - -from modules.stft import TorchSTFT - -import torch.distributions as D - -from 
modules.losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss -) - -LRELU_SLOPE = 0.1 +from modules.commons import init_weights, get_padding +from vdecoder.hifigan.models import Generator +from utils import f0_to_coarse -class PostF0Decoder(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, spk_channels=0): - super().__init__() - - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.gin_channels = spk_channels - - self.drop = nn.Dropout(p_dropout) - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.norm_2 = modules.LayerNorm(filter_channels) - self.proj = nn.Conv1d(filter_channels, 1, 1) - - if spk_channels != 0: - self.cond = nn.Conv1d(spk_channels, in_channels, 1) - - def forward(self, x, x_mask, g=None): - x = torch.detach(x) - if g is not None: - g = torch.detach(g) - x = x + self.cond(g) - x = self.conv_1(x * x_mask) - x = torch.relu(x) - x = self.norm_1(x) - x = self.drop(x) - x = self.conv_2(x * x_mask) - x = torch.relu(x) - x = self.norm_2(x) - x = self.drop(x) - x = self.proj(x * x_mask) - return x * x_mask - - -class TextEncoder(nn.Module): +class ResidualCouplingBlock(nn.Module): def __init__(self, - c_dim, - out_channels, + channels, hidden_channels, - filter_channels, - n_heads, - n_layers, kernel_size, - p_dropout): + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): super().__init__() - self.out_channels = out_channels + self.channels = channels self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers self.kernel_size = kernel_size - self.p_dropout = p_dropout + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels - self.pre_net = torch.nn.Linear(c_dim, hidden_channels) + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, + gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - - def forward(self, x, x_lengths): - x = x.transpose(1,-1) - x = self.pre_net(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - x = self.encoder(x * x_mask, x_mask) - x = self.proj(x) * x_mask - return x, x_mask - - -def pad_v2(input_ele, mel_max_length=None): - if mel_max_length: - max_len = mel_max_length - else: - max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) - - out_list = list() - for i, batch in enumerate(input_ele): - if len(batch.shape) == 1: - one_batch_padded = F.pad( - batch, (0, max_len - batch.size(0)), "constant", 0.0 - ) - elif len(batch.shape) == 2: - one_batch_padded = F.pad( - batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 - ) - out_list.append(one_batch_padded) - out_padded = torch.stack(out_list) - return out_padded - - -class LengthRegulator(nn.Module): - """ Length Regulator """ - - def __init__(self): - super(LengthRegulator, 
self).__init__() - - def LR(self, x, duration, max_len): - x = torch.transpose(x, 1, 2) - output = list() - mel_len = list() - for batch, expand_target in zip(x, duration): - expanded = self.expand(batch, expand_target) - output.append(expanded) - mel_len.append(expanded.shape[0]) - - if max_len is not None: - output = pad_v2(output, max_len) + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) else: - output = pad_v2(output) - output = torch.transpose(output, 1, 2) - return output, torch.LongTensor(mel_len) - - def expand(self, batch, predicted): - predicted = torch.squeeze(predicted) - out = list() - - for i, vec in enumerate(batch): - expand_size = predicted[i].item() - state_info_index = torch.unsqueeze(torch.arange(0, expand_size), 1).float() - state_info_length = torch.unsqueeze(torch.Tensor([expand_size] * expand_size), 1).float() - state_info = torch.cat([state_info_index, state_info_length], 1).to(vec.device) - new_vec = vec.expand(max(int(expand_size), 0), -1) - new_vec = torch.cat([new_vec, state_info], 1) - out.append(new_vec) - out = torch.cat(out, 0) - return out - - def forward(self, x, duration, max_len): - output, mel_len = self.LR(x, duration, max_len) - return output, mel_len - - -class PriorDecoder(nn.Module): - def __init__(self, - out_bn_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - n_speakers=0, - spk_channels=0): - super().__init__() - self.out_bn_channels = out_bn_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.spk_channels = spk_channels - - self.prenet = nn.Conv1d(hidden_channels , hidden_channels, 3, padding=1) - self.decoder = attentions.FFT( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_bn_channels, 1) - - if n_speakers != 0: - self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) - - def forward(self, x, x_lengths, spk_emb=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - - x = self.prenet(x) * x_mask - - if (spk_emb is not None): - x = x + self.cond(spk_emb) - - x = self.decoder(x * x_mask, x_mask) - - bn = self.proj(x) * x_mask - - return bn, x_mask - - -class Decoder(nn.Module): - def __init__(self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - n_speakers=0, - spk_channels=0, - in_channels=None): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.spk_channels = spk_channels - - self.prenet = nn.Conv1d(in_channels if in_channels is not None else hidden_channels, hidden_channels, 3, padding=1) - self.decoder = attentions.FFT( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - - if n_speakers != 0: - self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) - - def forward(self, x, x_lengths, spk_emb=None): - x = torch.detach(x) - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - - x = self.prenet(x) * x_mask - - if (spk_emb 
is not None): - x = x + self.cond(spk_emb) - - x = self.decoder(x * x_mask, x_mask) - - x = self.proj(x) * x_mask - - return x, x_mask - -class F0Decoder(nn.Module): - def __init__(self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - n_speakers=0, - spk_channels=0, - in_channels=None): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.spk_channels = spk_channels - - self.prenet = nn.Conv1d(in_channels if in_channels is not None else hidden_channels, hidden_channels, 3, padding=1) - self.decoder = attentions.FFT( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.f0_prenet = nn.Conv1d(1, hidden_channels , 3, padding=1) - - if n_speakers != 0: - self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) - - def forward(self, x, norm_f0, x_lengths, spk_emb=None): - x = torch.detach(x) - x += self.f0_prenet(norm_f0) - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - - x = self.prenet(x) * x_mask - - if (spk_emb is not None): - x = x + self.cond(spk_emb) - - x = self.decoder(x * x_mask, x_mask) - - x = self.proj(x) * x_mask - - return x, x_mask - - -class ConvReluNorm(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." 
- - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential( - nn.ReLU(), - nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x): - x = self.conv_layers[0](x) - x = self.norm_layers[0](x) - x = self.relu_drop(x) - - for i in range(1, self.n_layers): - x_ = self.conv_layers[i](x) - x_ = self.norm_layers[i](x_) - x_ = self.relu_drop(x_) - x = (x + x_) / 2 - x = self.proj(x) + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) return x -class PosteriorEncoder(nn.Module): +class Encoder(nn.Module): def __init__(self, - hps, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, - n_layers): + n_layers, + gin_channels=0): super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -381,320 +66,57 @@ class PosteriorEncoder(nn.Module): self.kernel_size = kernel_size self.dilation_rate = dilation_rate self.n_layers = n_layers + self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=hps.data.n_speakers, spk_channels=hps.model.spk_channels) - # self.enc = ConvReluNorm(hidden_channels, - # hidden_channels, - # hidden_channels, - # kernel_size, - # n_layers, - # 0.1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): + # print(x.shape,x_lengths.shape) x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask - return stats, x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask -class ResBlock3(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock3, self).__init__() - self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))) - ]) - self.convs.apply(init_weights) - - def forward(self, x, x_mask=None): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_weight_norm(l) - - -class Generator_Harm(torch.nn.Module): - def __init__(self, hps): - super(Generator_Harm, self).__init__() - self.hps = hps - - self.prenet = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) - - self.net = ConvReluNorm(hps.model.hidden_channels, - hps.model.hidden_channels, - hps.model.hidden_channels, - hps.model.kernel_size, - 8, - hps.model.p_dropout) - - # self.rnn = nn.LSTM(input_size=hps.model.hidden_channels, - # hidden_size=hps.model.hidden_channels, - # num_layers=1, - # bias=True, - # batch_first=True, - # 
dropout=0.5, - # bidirectional=True) - self.postnet = Conv1d(hps.model.hidden_channels, hps.model.n_harmonic + 1, 3, padding=1) - - def forward(self, f0, harm, mask): - pitch = f0.transpose(1, 2) - harm = self.prenet(harm) - - harm = self.net(harm) * mask - # harm = harm.transpose(1, 2) - # harm, (hs, hc) = self.rnn(harm) - # harm = harm.transpose(1, 2) - - harm = self.postnet(harm) - harm = harm.transpose(1, 2) - param = harm - - param = scale_function(param) - total_amp = param[..., :1] - amplitudes = param[..., 1:] - amplitudes = remove_above_nyquist( - amplitudes, - pitch, - self.hps.data.sampling_rate, - ) - amplitudes /= amplitudes.sum(-1, keepdim=True) - amplitudes *= total_amp - - amplitudes = upsample(amplitudes, self.hps.data.hop_length) - pitch = upsample(pitch, self.hps.data.hop_length) - - n_harmonic = amplitudes.shape[-1] - omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sampling_rate, 1) - omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) - signal_harmonics = (torch.sin(omegas) * amplitudes) - signal_harmonics = signal_harmonics.transpose(1, 2) - return signal_harmonics - - -class Generator(torch.nn.Module): - def __init__(self, hps, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, spk_channels=0): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - self.upsample_rates = upsample_rates - self.n_speakers = n_speakers - - resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 - - self.downs = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - i = len(upsample_rates) - 1 - i - u = upsample_rates[i] - k = upsample_kernel_sizes[i] - # print("down: ",upsample_initial_channel//(2**(i+1))," -> ", upsample_initial_channel//(2**i)) - self.downs.append(weight_norm( - Conv1d(hps.model.n_harmonic + 2, hps.model.n_harmonic + 2, - k, u, padding=k // 2))) - - self.resblocks_downs = nn.ModuleList() - for i in range(len(self.downs)): - j = len(upsample_rates) - 1 - i - self.resblocks_downs.append(ResBlock3(hps.model.n_harmonic + 2, 3, (1, 3))) - - self.concat_pre = Conv1d(upsample_initial_channel + hps.model.n_harmonic + 2, upsample_initial_channel, 3, 1, - padding=1) - self.concat_conv = nn.ModuleList() - for i in range(len(upsample_rates)): - ch = upsample_initial_channel // (2 ** (i + 1)) - self.concat_conv.append(Conv1d(ch + hps.model.n_harmonic + 2, ch, 3, 1, padding=1, bias=False)) - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm( - ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), - k, u, padding=(k - u) // 2))) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if self.n_speakers != 0: - self.cond = nn.Conv1d(spk_channels, upsample_initial_channel, 1) - - def forward(self, x, ddsp, g=None): - - x = self.conv_pre(x) - - if g is not None: - x = x + self.cond(g) - - se = ddsp - res_features = [se] - for i in range(self.num_upsamples): -
in_size = se.size(2) - se = self.downs[i](se) - se = self.resblocks_downs[i](se) - up_rate = self.upsample_rates[self.num_upsamples - 1 - i] - se = se[:, :, : in_size // up_rate] - res_features.append(se) - - x = torch.cat([x, se], 1) - x = self.concat_pre(x) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - in_size = x.size(2) - x = self.ups[i](x) - # ensure the dimensions are correct and drop the redundant channels - x = x[:, :, : in_size * self.upsample_rates[i]] - - x = torch.cat([x, res_features[self.num_upsamples - 1 - i]], 1) - x = self.concat_conv[i](x) - - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - print('Removing weight norm...') - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -from scipy.signal import get_window -def init_kernels(win_len, win_inc, fft_len, win_type=None, invers=False): - if win_type == 'None' or win_type is None: - window = np.ones(win_len) - else: - window = get_window(win_type, win_len, fftbins=True)#**0.5 - - N = fft_len - fourier_basis = np.fft.rfft(np.eye(N))[:win_len] - real_kernel = np.real(fourier_basis) - imag_kernel = np.imag(fourier_basis) - kernel = np.concatenate([real_kernel, imag_kernel], 1).T - - if invers : - kernel = np.linalg.pinv(kernel).T - - kernel = kernel*window - kernel = kernel[:, None, :] - return torch.from_numpy(kernel.astype(np.float32)), torch.from_numpy(window[None,:,None].astype(np.float32)) - - -class ConviSTFT(nn.Module): - def __init__(self, win_len, win_inc, fft_len=None, win_type='hamming', feature_type='real', fix=True): - super(ConviSTFT, self).__init__() - if fft_len == None: - self.fft_len = np.int(2**np.ceil(np.log2(win_len))) - else: - self.fft_len = fft_len - kernel, window = init_kernels(win_len, win_inc, self.fft_len, win_type, invers=True) - #self.weight = nn.Parameter(kernel, requires_grad=(not fix)) - self.register_buffer('weight', kernel) - self.feature_type = feature_type - self.win_type = win_type - self.win_len = win_len - self.stride = win_inc - self.dim = self.fft_len - self.register_buffer('window', window) - self.register_buffer('enframe', torch.eye(win_len)[:,None,:]) - - def forward(self, inputs, t): - outputs = F.conv_transpose1d(inputs, self.weight, stride=self.stride) - coff = F.conv_transpose1d(t, self.enframe, stride=self.stride) - outputs = outputs/(coff+1e-8) - #outputs = torch.where(coff == 0, outputs, outputs/coff) - outputs = outputs[...,768:-768] - return outputs - - -class Generator_Noise(torch.nn.Module): - def __init__(self, hps): - super(Generator_Noise, self).__init__() - self.hps = hps - self.win_size = hps.data.win_size - self.hop_size = hps.data.hop_length - self.fft_size = hps.data.n_fft - self.istft_pre = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) - - self.net = ConvReluNorm(hps.model.hidden_channels, - hps.model.hidden_channels, - hps.model.hidden_channels, - hps.model.kernel_size, - 8, - hps.model.p_dropout) - - self.istft_amplitude = torch.nn.Conv1d(hps.model.hidden_channels, self.fft_size // 2 + 1, 1, 1) - self.window = torch.hann_window(self.win_size) - self.istft = ConviSTFT(self.win_size, self.hop_size ,self.fft_size) - - def forward(self, x, mask, t_window): - istft_x = x - istft_x = self.istft_pre(istft_x) - - istft_x =
self.net(istft_x) * mask - - amp = self.istft_amplitude(istft_x).unsqueeze(-1) - phase = (torch.rand(amp.shape) * 2 * 3.14 - 3.14).to(amp) - - real = amp * torch.cos(phase) - imag = amp * torch.sin(phase) - - ''' - spec = torch.cat([real, imag], 1).squeeze(3) - print(spec.shape) - istft_x = self.istft(spec) - - spec = torch.cat([real, imag], 3) - istft_x = torch.istft(spec, self.fft_size, self.hop_size, self.win_size, self.window.to(amp), True, - length=x.shape[2] * self.hop_size, return_complex=False) - ''' - spec = torch.cat([real, imag], 1).squeeze(3) - istft_x = self.istft(spec, t_window) - - return istft_x - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): +class TextEncoder(nn.Module): + def __init__(self, + out_channels, + hidden_channels, + kernel_size, + n_layers, + gin_channels=0, + filter_channels=None, + n_heads=None, + p_dropout=None): super().__init__() - self.channels = channels - self.eps = eps + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.gin_channels = gin_channels + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + self.f0_emb = nn.Embedding(256, hidden_channels) - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) + self.enc_ = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) + def forward(self, x, x_mask, f0=None, z=None): + x = x + self.f0_emb(f0).transpose(1, 2) + x = self.enc_(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + z * torch.exp(logs)) * x_mask + return z, m, logs, x_mask class DiscriminatorP(torch.nn.Module): @@ -762,284 +184,152 @@ class DiscriminatorS(torch.nn.Module): return x, fmap -class MultiFrequencyDiscriminator(nn.Module): +class F0Decoder(nn.Module): def __init__(self, - hop_lengths=[128, 256, 512], - hidden_channels=[256, 512, 512], - domain='double', mel_scale=True): - super(MultiFrequencyDiscriminator, self).__init__() + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=0): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.spk_channels = spk_channels - self.stfts = nn.ModuleList([ - TorchSTFT(fft_size=x * 4, hop_size=x, win_size=x * 4, - normalized=True, domain=domain, mel_scale=mel_scale) - for x in hop_lengths]) + self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1) + self.decoder = attentions.FFT( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1) + self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) - self.domain = domain - if domain == 'double': - self.discriminators = nn.ModuleList([ - BaseFrequenceDiscriminator(2, c) - for x, c in zip(hop_lengths, hidden_channels)]) - else: - self.discriminators = nn.ModuleList([ - BaseFrequenceDiscriminator(1, c) - for x, c in zip(hop_lengths, hidden_channels)]) - - def forward(self, x): - scores, 
feats = list(), list() - for stft, layer in zip(self.stfts, self.discriminators): - # print(stft) - mag, phase = stft.transform(x.squeeze()) - if self.domain == 'double': - mag = torch.stack(torch.chunk(mag, 2, dim=1), dim=1) - else: - mag = mag.unsqueeze(1) - - score, feat = layer(mag) - scores.append(score) - feats.append(feat) - return scores, feats - - -class BaseFrequenceDiscriminator(nn.Module): - def __init__(self, in_channels, hidden_channels=512): - super(BaseFrequenceDiscriminator, self).__init__() - - self.discriminator = nn.ModuleList() - self.discriminator += [ - nn.Sequential( - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - in_channels, hidden_channels // 32, - kernel_size=(3, 3), stride=(1, 1))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels // 32, hidden_channels // 16, - kernel_size=(3, 3), stride=(2, 2))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels // 16, hidden_channels // 8, - kernel_size=(3, 3), stride=(1, 1))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels // 8, hidden_channels // 4, - kernel_size=(3, 3), stride=(2, 2))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels // 4, hidden_channels // 2, - kernel_size=(3, 3), stride=(1, 1))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels // 2, hidden_channels, - kernel_size=(3, 3), stride=(2, 2))) - ), - nn.Sequential( - nn.LeakyReLU(0.2, True), - nn.ReflectionPad2d((1, 1, 1, 1)), - nn.utils.weight_norm(nn.Conv2d( - hidden_channels, 1, - kernel_size=(3, 3), stride=(1, 1))) - ) - ] - - def forward(self, x): - hiddens = [] - for layer in self.discriminator: - x = layer(x) - hiddens.append(x) - return x, hiddens[-1] - - -class Discriminator(torch.nn.Module): - def __init__(self, hps, use_spectral_norm=False): - super(Discriminator, self).__init__() - periods = [2, 3, 5, 7, 11] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] - self.discriminators = nn.ModuleList(discs) - # self.disc_multfrequency = MultiFrequencyDiscriminator(hop_lengths=[int(hps.data.sampling_rate * 2.5 / 1000), - # int(hps.data.sampling_rate * 5 / 1000), - # int(hps.data.sampling_rate * 7.5 / 1000), - # int(hps.data.sampling_rate * 10 / 1000), - # int(hps.data.sampling_rate * 12.5 / 1000), - # int(hps.data.sampling_rate * 15 / 1000)], - # hidden_channels=[256, 256, 256, 256, 256]) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - # scores_r, fmaps_r = self.disc_multfrequency(y) - # scores_g, fmaps_g = self.disc_multfrequency(y_hat) - # for i in range(len(scores_r)): - # y_d_rs.append(scores_r[i]) - # y_d_gs.append(scores_g[i]) - # fmap_rs.append(fmaps_r[i]) - # fmap_gs.append(fmaps_g[i]) - return y_d_rs, y_d_gs, fmap_rs, fmap_gs + def forward(self, x, norm_f0, x_mask, spk_emb=None): + x = torch.detach(x) + if spk_emb is not None: + x = x + 
self.cond(spk_emb) + x += self.f0_prenet(norm_f0) + x = self.prenet(x) * x_mask + x = self.decoder(x * x_mask, x_mask) + x = self.proj(x) * x_mask + return x class SynthesizerTrn(nn.Module): """ - Model - """ + Synthesizer for Training + """ - def __init__(self, hps): + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + n_speakers, + sampling_rate=44100, + **kwargs): super().__init__() - self.hps = hps + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.emb_g = nn.Embedding(n_speakers, gin_channels) - self.text_encoder = TextEncoder( - hps.data.c_dim, - hps.model.prior_hidden_channels, - hps.model.prior_hidden_channels, - hps.model.prior_filter_channels, - hps.model.prior_n_heads, - hps.model.prior_n_layers, - hps.model.prior_kernel_size, - hps.model.prior_p_dropout) + self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2) - self.decoder = PriorDecoder( - hps.model.hidden_channels * 2, - hps.model.prior_hidden_channels, - hps.model.prior_filter_channels, - hps.model.prior_n_heads, - hps.model.prior_n_layers, - hps.model.prior_kernel_size, - hps.model.prior_p_dropout, - n_speakers=hps.data.n_speakers, - spk_channels=hps.model.spk_channels + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels=filter_channels, + n_heads=n_heads, + n_layers=n_layers, + kernel_size=kernel_size, + p_dropout=p_dropout ) - + hps = { + "sampling_rate": sampling_rate, + "inter_channels": inter_channels, + "resblock": resblock, + "resblock_kernel_sizes": resblock_kernel_sizes, + "resblock_dilation_sizes": resblock_dilation_sizes, + "upsample_rates": upsample_rates, + "upsample_initial_channel": upsample_initial_channel, + "upsample_kernel_sizes": upsample_kernel_sizes, + "gin_channels": gin_channels, + } + self.dec = Generator(h=hps) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) self.f0_decoder = F0Decoder( 1, - hps.model.prior_hidden_channels, - hps.model.prior_filter_channels, - hps.model.prior_n_heads, - hps.model.prior_n_layers, - hps.model.prior_kernel_size, - hps.model.prior_p_dropout, - n_speakers=hps.data.n_speakers, - spk_channels=hps.model.spk_channels + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + spk_channels=gin_channels ) + self.emb_uv = nn.Embedding(2, hidden_channels) + self.predict_f0 = False - self.mel_decoder = Decoder( - hps.data.acoustic_dim, - hps.model.prior_hidden_channels, - hps.model.prior_filter_channels, - hps.model.prior_n_heads, - hps.model.prior_n_layers, - 
hps.model.prior_kernel_size, - hps.model.prior_p_dropout, - n_speakers=hps.data.n_speakers, - spk_channels=hps.model.spk_channels - ) - - self.posterior_encoder = PosteriorEncoder( - hps, - hps.data.acoustic_dim, - hps.model.hidden_channels, - hps.model.hidden_channels, 3, 1, 8) - - self.dropout = nn.Dropout(0.2) - - self.LR = LengthRegulator() - - self.dec = Generator(hps, - hps.model.hidden_channels, - hps.model.resblock, - hps.model.resblock_kernel_sizes, - hps.model.resblock_dilation_sizes, - hps.model.upsample_rates, - hps.model.upsample_initial_channel, - hps.model.upsample_kernel_sizes, - n_speakers=hps.data.n_speakers, - spk_channels=hps.model.spk_channels) - - self.dec_harm = Generator_Harm(hps) - - self.dec_noise = Generator_Noise(hps) - - self.f0_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels , 3, padding=1) - self.energy_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels , 3, padding=1) - self.mel_prenet = nn.Conv1d(hps.data.acoustic_dim, hps.model.prior_hidden_channels , 3, padding=1) - - if hps.data.n_speakers > 1: - self.emb_spk = nn.Embedding(hps.data.n_speakers, hps.model.spk_channels) - self.flow = modules.ResidualCouplingBlock(hps.model.prior_hidden_channels, hps.model.hidden_channels, 5, 1, 4,n_speakers=hps.data.n_speakers, gin_channels=hps.model.spk_channels) - - def forward(self, c, f0, mel2ph, t_window, noise=None, g=None): - if len(g.shape) == 2: - g = g.squeeze(0) - if len(f0.shape) == 2: - f0 = f0.unsqueeze(0) - g = self.emb_spk(g).unsqueeze(-1) # [b, h, 1] + def forward(self, c, f0, mel2ph, uv, noise=None, g=None): decoder_inp = F.pad(c, [0, 0, 1, 0]) mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, c.shape[-1]]) c = torch.gather(decoder_inp, 1, mel2ph_).transpose(1, 2) # [B, T, H] c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype) + x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) - # Encoder - decoder_input, x_mask = self.text_encoder(c, c_lengths) - y_lengths = c_lengths - - LF0 = 2595. * torch.log10(1. + f0 / 700.) - LF0 = LF0 / 500 - - # aam - predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), y_lengths, spk_emb=g) - predict_energy = predict_mel.sum(1).unsqueeze(1) / self.hps.data.acoustic_dim - - decoder_input = decoder_input + \ - self.f0_prenet(LF0) + \ - self.energy_prenet(predict_energy) + \ - self.mel_prenet(predict_mel) - decoder_output, y_mask = self.decoder(decoder_input, y_lengths, spk_emb=g) - - prior_info = decoder_output - - m_p = prior_info[:, :self.hps.model.hidden_channels, :] - logs_p = prior_info[:, self.hps.model.hidden_channels:, :] - z_p = m_p + torch.exp(logs_p) * noise - z = self.flow(z_p, y_mask, g=g, reverse=True) - - prior_z = z - - noise_x = self.dec_noise(prior_z, y_mask, t_window) - - harm_x = self.dec_harm(f0, prior_z, y_mask) - - pitch = upsample(f0.transpose(1, 2), self.hps.data.hop_length) - omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sampling_rate, 1) - sin = torch.sin(omega).transpose(1, 2) - - decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) - - # dsp based HiFiGAN vocoder - o = self.dec(prior_z, decoder_condition, g=g) + if self.predict_f0: + lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) 
/ 500 + norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False) + pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g) + f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1) + z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), z=noise) + z = self.flow(z_p, c_mask, g=g, reverse=True) + o = self.dec(z * c_mask, g=g, f0=f0) return o
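
Usage note: the load_model() change in inference/infer_tool.py wires the inference loader to the same constructor the training code uses: spectrogram bins and segment frames are derived from the data and train sections of the new config, and everything else comes from hps.model. A minimal sketch of the resulting call, assuming the repo's existing utils.get_hparams_from_file loader and the training-time models.SynthesizerTrn import path:

    import utils
    from models import SynthesizerTrn  # training-time model; import path assumed

    hps = utils.get_hparams_from_file("configs/config.json")
    net_g = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,                # spectrogram bins: 2048 // 2 + 1 = 1025
        hps.train.segment_size // hps.data.hop_length,  # segment frames: 10240 // 512 = 20
        **hps.model)                                    # inter_channels, ssl_dim, n_speakers, ...
    net_g.eval()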
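
The predict_f0 branch in SynthesizerTrn.forward compresses pitch to a normalized mel-scale log-F0, lf0 = 2595 * log10(1 + f0 / 700) / 500, and maps the decoder's prediction back with f0 = 700 * (10 ** (lf0 * 500 / 2595) - 1). A self-contained round-trip check of the two formulas (plain PyTorch, no repo code):

    import torch

    f0 = torch.tensor([100., 220., 440.])              # pitch in Hz
    lf0 = 2595. * torch.log10(1. + f0 / 700.) / 500.   # mel-scaled log-F0, roughly 0..1
    f0_back = 700. * (torch.pow(10., lf0 * 500. / 2595.) - 1.)
    assert torch.allclose(f0, f0_back, atol=1e-3)      # the inverse recovers the input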
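
The rewritten onnxexport/model_onnx.py drops the old DDSP harmonic/noise branch and exposes forward(c, f0, mel2ph, uv, noise, g) on top of the vdecoder.hifigan Generator. A hedged export sketch: the shapes follow from the code above (c is [1, T, ssl_dim] before the internal mel2ph gather, noise matches the flow input), while the dummy values, output file name, and opset are assumptions rather than part of this patch:

    import torch
    from onnxexport.model_onnx import SynthesizerTrn

    model = SynthesizerTrn(1025, 20, **hps.model)    # hps as in the sketch above
    model.eval()

    T = 200                                          # number of content frames
    c = torch.randn(1, T, 256)                       # SSL features, ssl_dim = 256
    mel2ph = torch.arange(1, T + 1).unsqueeze(0)     # identity alignment; slot 0 is the pad frame
    f0 = torch.full((1, T), 220.)                    # pitch in Hz
    uv = torch.ones(1, T)                            # voiced/unvoiced flags
    noise = torch.randn(1, 192, T)                   # prior noise, inter_channels = 192
    sid = torch.LongTensor([0])                      # speaker id fed to emb_g

    with torch.no_grad():
        audio = model(c, f0, mel2ph, uv, noise=noise, g=sid)

    torch.onnx.export(model, (c, f0, mel2ph, uv, noise, sid), "model.onnx",
                      input_names=["c", "f0", "mel2ph", "uv", "noise", "sid"],
                      output_names=["audio"],
                      dynamic_axes={"c": [1], "f0": [1], "mel2ph": [1], "uv": [1], "noise": [2]},
                      opset_version=16)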