ylzz1997 2023-05-17 01:10:43 +08:00
parent 7fbca2ee1f
commit 8cc7645379
6 changed files with 18 additions and 15 deletions

View File

@@ -7,7 +7,6 @@ data:
   encoder_sample_rate: 16000
   encoder_hop_size: 320
   encoder_out_channels: 768 # 256 if using 'hubertsoft'
-  train_path: dataset/44k # Create a folder named "audio" under this path and put the audio clips in it
   training_files: "filelists/train.txt"
   validation_files: "filelists/val.txt"
   extensions: # List of extensions included in the data collection
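
With train_path removed, the training set is defined entirely by the filelists rather than by scanning a fixed dataset folder. A minimal sketch of building such a filelist, where each line is one audio path (paths here are illustrative) that the loader changes below consume verbatim:

    # hypothetical filelist builder: one audio path per line; the AudioDataset
    # changes further down use each line directly as path_audio
    clips = [
        "dataset/44k/speaker0/clip_000.wav",
        "dataset/44k/speaker0/clip_001.wav",
    ]
    with open("filelists/train.txt", "w") as f:
        f.write("\n".join(clips))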
@@ -27,7 +26,7 @@ infer:
   speedup: 10
   method: 'dpm-solver' # 'pndm' or 'dpm-solver'
 env:
-  expdir: exp/diffusion-test
+  expdir: logs/44k/diffusion
   gpu_id: 0
 train:
   num_workers: 2 # If both your CPU and GPU are strong, setting this to 0 may be faster!
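
The config keys touched here are consumed with attribute-style access in the loader changes further down (args.data.training_files, args.train.num_workers), which implies the YAML is wrapped in a dot-accessible namespace. A minimal sketch under that assumption; the DotDict helper and the config path are illustrative stand-ins, not the repo's actual code:

    import yaml

    class DotDict(dict):
        """Illustrative stand-in for a dot-accessible config wrapper."""
        def __getattr__(self, key):
            value = self[key]
            return DotDict(value) if isinstance(value, dict) else value

    with open("configs/diffusion.yaml") as f:   # config path assumed
        args = DotDict(yaml.safe_load(f))
    print(args.data.training_files)   # "filelists/train.txt"
    print(args.env.expdir)            # "logs/44k/diffusion"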

diffusion/__init__.py Normal file (empty)
View File

View File

@@ -5,6 +5,7 @@ import numpy as np
 import librosa
 import torch
 import random
+from utils import repeat_expand_2d
 from tqdm import tqdm
 from torch.utils.data import Dataset
@@ -51,7 +52,7 @@ def traverse_dir(
 def get_data_loaders(args, whole_audio=False):
     data_train = AudioDataset(
-        filelists_path = args.training_files,
+        filelists = args.data.training_files,
         waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
@@ -72,7 +73,7 @@ def get_data_loaders(args, whole_audio=False):
         pin_memory=True if args.train.cache_device=='cpu' else False
     )
     data_valid = AudioDataset(
-        filelists_path = args.validation_files,
+        filelists = args.data.validation_files,
         waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
@@ -123,15 +124,15 @@ class AudioDataset(Dataset):
         else:
             print('Load the f0, volume data filelists:', filelists)
         with open(filelists,"r") as f:
-            self.paths = f.readlines()
+            self.paths = f.read().splitlines()
         for name_ext in tqdm(self.paths, total=len(self.paths)):
             name = os.path.splitext(name_ext)[0]
             path_audio = name_ext
             duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
             path_f0 = name_ext + ".f0.npy"
-            f0 = np.load(path_f0)
-            f0 = torch.from_numpy(f0).float().unsqueeze(-1).to(device)
+            f0,_ = np.load(path_f0,allow_pickle=True)
+            f0 = torch.from_numpy(np.array(f0,dtype=float)).float().unsqueeze(-1).to(device)
             path_volume = name_ext + ".vol.npy"
             volume = np.load(path_volume)
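
Two behavioural changes in this hunk are easy to miss. readlines() keeps the trailing newline on every entry, so concatenations like name_ext + ".f0.npy" would embed a "\n" in the path; read().splitlines() yields clean paths. Separately, the .f0.npy files are now expected to hold a pickled pair rather than a bare array, hence allow_pickle=True and the f0,_ unpacking (the second element is assumed here to be a voiced/unvoiced mask, judging only from the unpacking). A short sketch of both:

    import numpy as np

    with open("filelists/train.txt") as f:
        first = f.readlines()[0]          # "clip.wav\n" breaks path building
    with open("filelists/train.txt") as f:
        first = f.read().splitlines()[0]  # "clip.wav", safe to concatenate

    # the f0 file now stores a pair as an object array; the second element is
    # assumed to be a voiced/unvoiced mask (hypothetical), hence allow_pickle
    f0 = np.random.rand(300).astype(np.float32)
    uv = f0 > 0.5
    np.save("clip.wav.f0.npy", np.asanyarray((f0, uv), dtype=object))
    f0_loaded, _ = np.load("clip.wav.f0.npy", allow_pickle=True)
    f0_loaded = np.array(f0_loaded, dtype=float)   # mirrors the loader above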
@@ -169,8 +170,9 @@
             path_units = name_ext + ".soft.pt"
             units = torch.load(path_units).to(device)
+            units = units[0]
+            units = repeat_expand_2d(units,f0.size(0)).transpose(0,1)
             if fp16:
                 mel = mel.half()
                 aug_mel = aug_mel.half()
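
The new repeat_expand_2d call stretches the unit features along time so they line up with the f0 frame count before transposing to [frames, dim]. The repo's implementation isn't shown in this diff; a nearest-neighbour sketch of the behaviour the call implies:

    import torch
    import torch.nn.functional as F

    def repeat_expand_2d_sketch(content: torch.Tensor, target_len: int) -> torch.Tensor:
        """Nearest-neighbour stretch of [dim, frames] to [dim, target_len];
        a sketch, not necessarily the repo's exact interpolation scheme."""
        return F.interpolate(content.unsqueeze(0).float(), size=target_len, mode="nearest")[0]

    units = torch.randn(768, 120)                  # one clip's encoder units
    aligned = repeat_expand_2d_sketch(units, 300)  # match f0.size(0) frames
    print(aligned.transpose(0, 1).shape)           # torch.Size([300, 768])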

View File

@@ -3,8 +3,8 @@ import time
 import numpy as np
 import torch
 import librosa
-from logger.saver import Saver
-from logger import utils
+from diffusion.logger.saver import Saver
+from diffusion.logger import utils
 from torch import autocast
 from torch.cuda.amp import GradScaler

View File

@@ -34,10 +34,10 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
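
Hoisting the device assignment above the cache check matters because device is presumably also referenced by later steps of process_one outside this hunk; with the old placement, a pre-existing .soft.pt would skip the branch and leave device undefined. A minimal reproduction of that failure mode:

    import os
    import torch

    soft_path = "clip.wav.soft.pt"   # hypothetical cached feature file
    if not os.path.exists(soft_path):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # if the cache already exists, the branch is skipped and any later
    # `.to(device)` raises NameError, hence the assignment moves up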

View File

@@ -19,7 +19,7 @@ from modules.commons import sequence_mask
 MATPLOTLIB_FLAG = False
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.WARN)
 logger = logging
 f0_bin = 256
@@ -415,10 +415,12 @@ class Volume_Extractor:
     def __init__(self, hop_size = 512):
         self.hop_size = hop_size
-    def extract(self, audio): # audio: 1d numpy array
-        n_frames = int(len(audio) // self.hop_size) + 1
+    def extract(self, audio): # audio: 2d tensor array
+        if not isinstance(audio, torch.Tensor):
+            audio = torch.Tensor(audio)
+        n_frames = int(audio.size(-1) // self.hop_size)
         audio2 = audio ** 2
         audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
-        volume = torch.FloatTensor([torch.mean(audio2[int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
+        volume = torch.FloatTensor([torch.mean(audio2[:, int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
         volume = torch.sqrt(volume)
         return volume
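
The extractor now accepts 2-D tensors (leading channel/batch dimension) instead of 1-D numpy arrays, drops the off-by-one +1 in the frame count, and averages each frame's energy across the leading dimension before taking the square root. A quick usage sketch of the new behaviour:

    import torch
    from utils import Volume_Extractor   # import path assumed from this hunk

    extractor = Volume_Extractor(hop_size=512)
    audio = torch.randn(1, 44100)        # [channels, samples]: one second at 44.1 kHz
    volume = extractor.extract(audio)    # per-frame RMS energy
    print(volume.shape)                  # torch.Size([86]), i.e. 44100 // 512 frames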