updated_kmeans_pytorch_gpu_version

RVC-Boss committed 2023-05-18 00:01:20 +08:00 (committed by GitHub)
parent 1f52a0b3ec
commit 9c2e463a81
2 changed files with 256 additions and 0 deletions

cluster/km_train.py (new file, 55 lines)

@@ -0,0 +1,55 @@
import pdb
import time
import logging
from pathlib import Path

import numpy as np
import torch
import tqdm
from sklearn.cluster import KMeans, MiniBatchKMeans

from cluster.kmeans import KMeansGPU

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
    # GPU minibatch mode performs poorly; the library supports it, but it is not considered here.
    in_dir = Path(in_dir)
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # count of loaded feature files
    features = np.concatenate(features, axis=0)
    print(nums, "files,", features.nbytes / 1024**2, "MB, shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if not use_gpu:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode="euclidean", verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)
        labels = kmeans.fit_predict(features)
    print(time.time() - t, "s")
    x = {
        # shape[1] is the feature dimensionality (shape[0] would be the sample count)
        "n_features_in_": kmeans.n_features_in_ if not use_gpu else features.shape[1],
        "_n_threads": kmeans._n_threads if not use_gpu else 4,
        "cluster_centers_": kmeans.cluster_centers_ if not use_gpu else kmeans.centroids.cpu().numpy(),
    }
    print("end")
    return x
if __name__ == "__main__":
    res = train_cluster("/data/docker/dataset/12b-co256tensor", 1000, use_minibatch=False, verbose=False, use_gpu=True)
    pdb.set_trace()
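    # Hedged usage sketch (paths, filenames, and the `frames` tensor are assumed,
    # not part of this commit): the returned dict mirrors a fitted sklearn KMeans,
    # so it can be persisted with torch.save and queried by nearest-centroid lookup:
    #
    #   ckpt = train_cluster("/path/to/soft_features", 1000, use_gpu=True)
    #   torch.save(ckpt, "kmeans_1000.pt")
    #   centers = torch.from_numpy(ckpt["cluster_centers_"])  # [1000, dim]
    #   # frames: hypothetical [n, dim] feature tensor to be labeled
    #   labels = torch.cdist(frames[None], centers[None])[0].argmin(-1)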

cluster/kmeans.py (new file, 201 lines)

@@ -0,0 +1,201 @@
import numpy as np
import pynvml
import torch
from torch.nn.functional import normalize
from time import time
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
    """Pick k points from the data using the k-means++ method.

    Parameters
    ----------
    data : torch.Tensor
        Expects a rank-1 or rank-2 tensor. Rank 1 is assumed to describe 1-D
        data; rank 2 is multidimensional data, in which case one row is one
        observation.
    k : int
        Number of centroids to generate.
    sample_size : int
        Subsample the data to this many rows to avoid memory overflow during
        the distance calculation; -1 disables subsampling.

    Returns
    -------
    init : torch.Tensor
        A (k, dims) tensor containing the initial centroids.

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM
       Symposium on Discrete Algorithms, 2007.
    .. [2] scipy/cluster/vq.py: _kpp
    """
    batch_size = data.shape[0]
    if 0 < sample_size < batch_size:
        data = data[torch.randint(0, batch_size, [sample_size], device=data.device)]
    dims = data.shape[1] if len(data.shape) > 1 else 1
    init = torch.zeros((k, dims)).to(data.device)
    r = torch.distributions.uniform.Uniform(0, 1)
    for i in range(k):
        if i == 0:
            # First centroid: a uniformly random data point.
            init[i, :] = data[torch.randint(data.shape[0], [1])]
        else:
            # Distance from each point to its nearest already-chosen centroid.
            D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
            probs = D2 / torch.sum(D2)
            cumprobs = torch.cumsum(probs, dim=0)
            # Sample the next centroid with probability proportional to that distance.
            init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
    return init
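# A quick sanity check of the seeding (illustrative only, not part of the
# training pipeline): seed 4 centroids from 1,000 random 2-D points, with
# the k-means++ distance pass restricted to a 256-row subsample.
#
#   pts = torch.randn(1000, 2)
#   seeds = _kpp(pts, k=4, sample_size=256)
#   assert seeds.shape == (4, 2)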
class KMeansGPU:
    '''
    K-means clustering implemented with PyTorch.

    Parameters:
      n_clusters: int
        Number of clusters
      max_iter: int, default: 200
        Maximum number of iterations
      tol: float, default: 1e-4
        Tolerance on the centroid update
      verbose: int, default: 0
        Verbosity
      mode: {'euclidean', 'cosine'}, default: 'euclidean'
        Type of distance measure
      device: torch.device, default: cuda:0
        Device that holds the centroids and (when it fits) the data

    Attributes:
      centroids: torch.Tensor, shape: [n_clusters, n_features]
        Cluster centroids
      minibatch: int
        Batch size used per iteration; derived from the free GPU memory
        in __init__ rather than passed by the caller
    '''
    def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean", device=torch.device("cuda:0")):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.mode = mode
        self.device = device
        pynvml.nvmlInit()
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
        info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
        # Heuristic: roughly 33e6 / n_clusters samples per GiB of free VRAM,
        # e.g. n_clusters=1000 with 24 GiB free gives a minibatch of 792,000.
        self.minibatch = int(33e6 / self.n_clusters * info.free / 1024 / 1024 / 1024)
        print("free_mem/GB:", info.free / 1024 / 1024 / 1024, "minibatch:", self.minibatch)
    @staticmethod
    def cos_sim(a, b):
        """
        Compute cosine similarity between two sets of vectors.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)

    @staticmethod
    def euc_sim(a, b):
        """
        Compute euclidean similarity between two sets of vectors: the negated
        squared euclidean distance, 2*a@b^T - |a|^2 - |b|^2.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return 2 * a @ b.transpose(-2, -1) - (a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
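    # Identity check (illustrative sketch): euc_sim equals -cdist(a, b)**2,
    # since -|a-b|^2 = 2*a.b - |a|^2 - |b|^2. For example:
    #
    #   a = torch.randn(5, 3, dtype=torch.float64)
    #   b = torch.randn(7, 3, dtype=torch.float64)
    #   assert torch.allclose(KMeansGPU.euc_sim(a, b), -torch.cdist(a, b) ** 2)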
    def max_sim(self, a, b):
        """
        For each vector in a, compute the maximum similarity (i.e. minimum
        distance) against all of the vectors in b.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        if self.mode == 'cosine':
            sim_func = self.cos_sim
        elif self.mode == 'euclidean':
            sim_func = self.euc_sim
        sim = sim_func(a, b)
        max_sim_v, max_sim_i = sim.max(dim=-1)
        return max_sim_v, max_sim_i
    def fit_predict(self, X):
        """
        Combination of fit() and predict() methods.
        This is faster than calling fit() and predict() separately.

        Parameters:
          X: torch.Tensor, shape: [n_samples, n_features]

        Return:
          labels: torch.Tensor, shape: [n_samples]
            (labels for the most recently processed batch when the data
            is subsampled to fit the minibatch budget)

        Sizing notes (minibatch is derived from free VRAM in __init__):
          minibatch = 33e6 / n_clusters * free_GiB
          offset    = 1.5 ** ln(n_clusters / 1000) / ln(2)  (= 1/ln 2 ~ 1.44 for 1000 clusters)
          k-means++ candidate pool = min(minibatch * 10 / offset, n_samples)
          k-means++ subsample size = min(minibatch / 12 / offset, n_samples)
        """
        assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
        assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
        assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features]"
        offset = np.power(1.5, np.log(self.n_clusters / 1000)) / np.log(2)
        with torch.no_grad():
            batch_size = X.shape[0]
            start_time = time()
            # Seed centroids with k-means++ on a (possibly subsampled) candidate pool.
            if self.minibatch * 10 // offset < batch_size:
                x = X[torch.randint(0, batch_size, [int(self.minibatch * 10 / offset)])].to(self.device)
            else:
                x = X.to(self.device)
            self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch / 12 / offset), batch_size))
            del x
            torch.cuda.empty_cache()
            num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)  # initialized to all ones
            closest = None
            if self.minibatch >= batch_size // 2 and self.minibatch < batch_size:
                # One fixed subsample that fits on the GPU.
                X = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
            elif self.minibatch >= batch_size:
                X = X.to(self.device)
            for i in range(self.max_iter):
                iter_time = time()
                if self.minibatch < batch_size // 2:
                    # The affordable minibatch is too small relative to the data, so a
                    # fresh sample is shuttled from host memory to VRAM every iteration.
                    x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
                else:
                    # Otherwise the (sub)sampled data is already cached on the GPU.
                    x = X
                # Index of the nearest centroid per sample (int16 suffices for <= 32767 clusters).
                closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)
                matched_clusters, counts = closest.unique(return_counts=True)
                # One-hot style mask: row c marks the samples assigned to cluster c.
                expanded_closest = closest[None].expand(self.n_clusters, -1)
                mask = (expanded_closest == torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)
                c_grad = mask @ x / mask.sum(-1)[..., :, None]
                c_grad[c_grad != c_grad] = 0  # remove NaNs from empty clusters
                error = (c_grad - self.centroids).pow(2).sum()
                if self.minibatch is not None:
                    # Per-cluster learning rate decays as a cluster accumulates points.
                    lr = 1 / num_points_in_clusters[:, None] * 0.9 + 0.1
                else:
                    lr = 1
                matched_clusters = matched_clusters.long()  # index tensors must be long, byte or bool
                num_points_in_clusters[matched_clusters] += counts
                self.centroids = self.centroids * (1 - lr) + c_grad * lr
                if self.verbose >= 2:
                    print('iter:', i, 'error:', error.item(), 'time spent:', round(time() - iter_time, 4))
                if error <= self.tol:
                    break
            if self.verbose >= 1:
                print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
            return closest
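
# Hedged usage sketch (shapes assumed, requires a CUDA device and pynvml;
# not prescribed by this file): cluster 200k 256-dim frames into 1000 groups.
#
#   feats = torch.randn(200_000, 256)  # [n_samples, n_features], float32
#   km = KMeansGPU(n_clusters=1000, max_iter=500, tol=1e-2, verbose=1)
#   labels = km.fit_predict(feats)     # int16 labels for the processed batch
#   centroids = km.centroids           # [1000, 256], on the GPU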