so-vits-svc/onnx_export.py

import torch
from torchaudio.models.wav2vec2.utils import import_fairseq_model
from fairseq import checkpoint_utils
from onnxexport.model_onnx import SynthesizerTrn
import utils

def get_hubert_model():
    vec_path = "hubert/checkpoint_best_legacy_500.pt"
    print("load model(s) from {}".format(vec_path))
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
        [vec_path],
        suffix="",
    )
    model = models[0]
    model.eval()
    return model


def main(HubertExport, NetExport):
    path = "SoVits4.0V2"

    '''if HubertExport:
        device = torch.device("cpu")
        vec_path = "hubert/checkpoint_best_legacy_500.pt"
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        original = models[0]
        original.eval()
        model = original
        test_input = torch.rand(1, 1, 16000)
        model(test_input)
        torch.onnx.export(model,
                          test_input,
                          "hubert4.0.onnx",
                          export_params=True,
                          opset_version=16,
                          do_constant_folding=True,
                          input_names=['source'],
                          output_names=['embed'],
                          dynamic_axes={
                              'source':
                                  {
                                      2: "sample_length"
                                  },
                          }
                          )'''
    if NetExport:
        device = torch.device("cpu")
        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        SVCVITS = SynthesizerTrn(
            hps)
        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
        _ = SVCVITS.eval().to(device)
        for i in SVCVITS.parameters():
            i.requires_grad = False
        test_hidden_unit = torch.rand(1, 10, 256) # rand
        test_pitch = torch.rand(1, 10) # rand
        test_mel2ph = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
        #test_mel2ph = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]).unsqueeze(0)
        test_uv = torch.zeros(1, 2048, 10, dtype=torch.float32)
        test_uv += 0.0064
        test_noise = torch.randn(1, 192, 10) # randn
        test_sid = torch.LongTensor([0])
        input_names = ["c", "f0", "mel2ph", "t_window", "noise", "sid"]
        output_names = ["audio", ]
        SVCVITS.eval()

        SVCVITS(test_hidden_unit.to(device),
                              test_pitch.to(device),
                              test_mel2ph.to(device),
                              test_uv.to(device),
                              test_noise.to(device),
                              test_sid.to(device))

        torch.onnx.export(SVCVITS,
                          (
                              test_hidden_unit.to(device),
                              test_pitch.to(device),
                              test_mel2ph.to(device),
                              test_uv.to(device),
                              test_noise.to(device),
                              test_sid.to(device)
                          ),
                          f"checkpoints/{path}/model.onnx",
                          dynamic_axes={
                              "c": [0, 1],
                              "f0": [1],
                              "mel2ph": [1],
                              "t_window": [2],
                              "noise": [2],
                          },
                          do_constant_folding=False,
                          opset_version=16,
                          verbose=False,
                          input_names=input_names,
                          output_names=output_names)


if __name__ == '__main__':
    main(False, True)