Debug Kmeans

ylzz1997 2023-05-20 19:43:33 +08:00
parent 268cd92da8
commit 7c754dc420
2 changed files with 2 additions and 82 deletions

View File

@@ -1,80 +0,0 @@
import time,pdb
import tqdm
from time import time as ttime
import os
from pathlib import Path
import logging
import argparse
from cluster.kmeans import KMeansGPU
import torch
import numpy as np
from sklearn.cluster import KMeans,MiniBatchKMeans
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import pynvml
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
    # GPU minibatch performs poorly; the library supports it, but it is not used here.
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        # for name in os.listdir(in_dir):
        #     path = "%s/%s" % (in_dir, name)
        # each file holds one utterance's features; transpose to (n_frames, n_dims)
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        # print(features[-1].shape)
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if use_gpu == False:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)  # .to(device)
        labels = kmeans.fit_predict(features)
    print(time.time() - t, "s")
    x = {
        "n_features_in_": kmeans.n_features_in_ if use_gpu == False else features.shape[0],
        "_n_threads": kmeans._n_threads if use_gpu == False else 4,
        "cluster_centers_": kmeans.cluster_centers_ if use_gpu == False else kmeans.centroids.cpu().numpy(),
    }
    print("end")
    return x
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')
    args = parser.parse_args()
    checkpoint_dir = args.output
    dataset = args.dataset
    n_clusters = 1000
    ckpt = {}
    for spk in os.listdir(dataset):
        if os.path.isdir(dataset / spk):
            print(f"train kmeans for {spk}...")
            in_dir = dataset / spk
            x = train_cluster(in_dir, n_clusters, use_minibatch=False, verbose=False, use_gpu=True)
            ckpt[spk] = x
    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
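
For context, the removed script writes one dict per speaker (cluster centers plus a couple of sklearn-style attributes) into a single .pt checkpoint. A minimal sketch of how such a checkpoint could be consumed, with the file path and the speaker key as placeholder assumptions, not part of this commit:

import numpy as np
import torch

# Hypothetical usage sketch: load the checkpoint written above and quantize one
# feature vector to its nearest cluster center.
ckpt = torch.load("logs/44k/kmeans_1000.pt", map_location="cpu")  # placeholder path
centers = ckpt["speaker_name"]["cluster_centers_"]                # (n_clusters, n_dims); "speaker_name" is a placeholder key
feat = np.random.randn(centers.shape[1]).astype(np.float32)       # one feature frame, shape (n_dims,)
dists = np.linalg.norm(centers - feat, axis=1)                    # Euclidean distance to every centroid
quantized = centers[int(np.argmin(dists))]                        # nearest cluster center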

View File

@@ -42,7 +42,7 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
     print(time.time()-t, "s")
     x = {
-        "n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[0],
+        "n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
         "_n_threads": kmeans._n_threads if use_gpu==False else 4,
         "cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
     }
@@ -65,7 +65,7 @@ if __name__ == "__main__":
     checkpoint_dir = args.output
     dataset = args.dataset
     use_gpu = args.gpu
-    n_clusters = 1000
+    n_clusters = 10000
     ckpt = {}
     for spk in os.listdir(dataset):
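
The first hunk corrects the GPU branch's n_features_in_: after np.concatenate the feature matrix has layout (n_samples, n_dims), so the feature dimensionality is features.shape[1], matching what sklearn's fitted estimators report as n_features_in_ on the CPU path. A small illustrative sketch; the shapes and parameters below are made-up examples, not values from this repository:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

# Illustration only: sklearn's n_features_in_ is the column count of the input matrix.
features = np.random.randn(5000, 256).astype(np.float32)  # (n_samples, n_dims); sizes are assumptions
km = MiniBatchKMeans(n_clusters=8, batch_size=1024).fit(features)
assert km.n_features_in_ == features.shape[1]  # 256, the feature dimension, not the 5000 samples in shape[0]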