updated_kmeans_pytorch_gpu_version

RVC-Boss committed 2023-05-18 00:01:20 +08:00 (committed by GitHub)
parent 1f52a0b3ec
commit 9c2e463a81
2 changed files with 256 additions and 0 deletions

cluster/km_train.py (new file, 55 lines)

@@ -0,0 +1,55 @@
import pdb
import time
import logging
from pathlib import Path

import numpy as np
import torch
import tqdm
from sklearn.cluster import KMeans, MiniBatchKMeans

from cluster.kmeans import KMeansGPU

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
    # GPU minibatch mode performs poorly; the library supports it, but it is not considered here.
    in_dir = Path(in_dir)
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # count of loaded feature files
    features = np.concatenate(features, axis=0)
    print(nums, "files,", features.nbytes / 1024**2, "MB, shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if not use_gpu:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode="euclidean", verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)
        labels = kmeans.fit_predict(features)
    print(time.time() - t, "s")
    x = {
        # shape[1] is the feature dimensionality (shape[0] would be the sample count)
        "n_features_in_": kmeans.n_features_in_ if not use_gpu else features.shape[1],
        "_n_threads": kmeans._n_threads if not use_gpu else 4,
        "cluster_centers_": kmeans.cluster_centers_ if not use_gpu else kmeans.centroids.cpu().numpy(),
    }
    print("end")
    return x
if __name__ == "__main__":
    res = train_cluster("/data/docker/dataset/12b-co256tensor", 1000, use_minibatch=False, verbose=False, use_gpu=True)
    pdb.set_trace()
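    # Hedged usage sketch (paths, filenames, and the `frames` tensor are assumed,
    # not part of this commit): the returned dict mirrors a fitted sklearn KMeans,
    # so it can be persisted with torch.save and queried by nearest-centroid lookup:
    #
    #   ckpt = train_cluster("/path/to/soft_features", 1000, use_gpu=True)
    #   torch.save(ckpt, "kmeans_1000.pt")
    #   centers = torch.from_numpy(ckpt["cluster_centers_"])  # [1000, dim]
    #   # frames: hypothetical [n, dim] feature tensor to be labeled
    #   labels = torch.cdist(frames[None], centers[None])[0].argmin(-1)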

cluster/kmeans.py (new file, 201 lines)

@@ -0,0 +1,201 @@
import numpy as np
import pynvml
import torch
from torch.nn.functional import normalize
from time import time
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
    """Pick k points from the data using the k-means++ method.

    Parameters
    ----------
    data : torch.Tensor
        Expects a rank-1 or rank-2 tensor. Rank 1 is assumed to describe 1-D
        data; rank 2 is multidimensional data, in which case one row is one
        observation.
    k : int
        Number of centroids to generate.
    sample_size : int
        Subsample the data to this many rows to avoid memory overflow during
        the distance calculation; -1 disables subsampling.

    Returns
    -------
    init : torch.Tensor
        A (k, dims) tensor containing the initial centroids.

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM
       Symposium on Discrete Algorithms, 2007.
    .. [2] scipy/cluster/vq.py: _kpp
    """
    batch_size = data.shape[0]
    if 0 < sample_size < batch_size:
        data = data[torch.randint(0, batch_size, [sample_size], device=data.device)]
    dims = data.shape[1] if len(data.shape) > 1 else 1
    init = torch.zeros((k, dims)).to(data.device)
    r = torch.distributions.uniform.Uniform(0, 1)
    for i in range(k):
        if i == 0:
            # First centroid: a uniformly random data point.
            init[i, :] = data[torch.randint(data.shape[0], [1])]
        else:
            # Distance from each point to its nearest already-chosen centroid.
            D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
            probs = D2 / torch.sum(D2)
            cumprobs = torch.cumsum(probs, dim=0)
            # Sample the next centroid with probability proportional to that distance.
            init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
    return init
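# A quick sanity check of the seeding (illustrative only, not part of the
# training pipeline): seed 4 centroids from 1,000 random 2-D points, with
# the k-means++ distance pass restricted to a 256-row subsample.
#
#   pts = torch.randn(1000, 2)
#   seeds = _kpp(pts, k=4, sample_size=256)
#   assert seeds.shape == (4, 2)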
class KMeansGPU:
    '''
    K-means clustering implemented with PyTorch.

    Parameters:
      n_clusters: int
        Number of clusters
      max_iter: int, default: 200
        Maximum number of iterations
      tol: float, default: 1e-4
        Tolerance on the centroid update
      verbose: int, default: 0
        Verbosity
      mode: {'euclidean', 'cosine'}, default: 'euclidean'
        Type of distance measure
      device: torch.device, default: cuda:0
        Device that holds the centroids and (when it fits) the data

    Attributes:
      centroids: torch.Tensor, shape: [n_clusters, n_features]
        Cluster centroids
      minibatch: int
        Batch size used per iteration; derived from the free GPU memory
        in __init__ rather than passed by the caller
    '''
    def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean", device=torch.device("cuda:0")):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.mode = mode
        self.device = device
        pynvml.nvmlInit()
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
        info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
        # Heuristic: roughly 33e6 / n_clusters samples per GiB of free VRAM,
        # e.g. n_clusters=1000 with 24 GiB free gives a minibatch of 792,000.
        self.minibatch = int(33e6 / self.n_clusters * info.free / 1024 / 1024 / 1024)
        print("free_mem/GB:", info.free / 1024 / 1024 / 1024, "minibatch:", self.minibatch)
    @staticmethod
    def cos_sim(a, b):
        """
        Compute cosine similarity between two sets of vectors.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)

    @staticmethod
    def euc_sim(a, b):
        """
        Compute euclidean similarity between two sets of vectors: the negated
        squared euclidean distance, 2*a@b^T - |a|^2 - |b|^2.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return 2 * a @ b.transpose(-2, -1) - (a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
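    # Identity check (illustrative sketch): euc_sim equals -cdist(a, b)**2,
    # since -|a-b|^2 = 2*a.b - |a|^2 - |b|^2. For example:
    #
    #   a = torch.randn(5, 3, dtype=torch.float64)
    #   b = torch.randn(7, 3, dtype=torch.float64)
    #   assert torch.allclose(KMeansGPU.euc_sim(a, b), -torch.cdist(a, b) ** 2)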
    def max_sim(self, a, b):
        """
        For each vector in a, compute the maximum similarity (i.e. minimum
        distance) against all of the vectors in b.

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        if self.mode == 'cosine':
            sim_func = self.cos_sim
        elif self.mode == 'euclidean':
            sim_func = self.euc_sim
        sim = sim_func(a, b)
        max_sim_v, max_sim_i = sim.max(dim=-1)
        return max_sim_v, max_sim_i
    def fit_predict(self, X):
        """
        Combination of fit() and predict() methods.
        This is faster than calling fit() and predict() separately.

        Parameters:
          X: torch.Tensor, shape: [n_samples, n_features]

        Return:
          labels: torch.Tensor, shape: [n_samples]
            (labels for the most recently processed batch when the data
            is subsampled to fit the minibatch budget)

        Sizing notes (minibatch is derived from free VRAM in __init__):
          minibatch = 33e6 / n_clusters * free_GiB
          offset    = 1.5 ** ln(n_clusters / 1000) / ln(2)  (= 1/ln 2 ~ 1.44 for 1000 clusters)
          k-means++ candidate pool = min(minibatch * 10 / offset, n_samples)
          k-means++ subsample size = min(minibatch / 12 / offset, n_samples)
        """
        assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
        assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
        assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features]"
        offset = np.power(1.5, np.log(self.n_clusters / 1000)) / np.log(2)
        with torch.no_grad():
            batch_size = X.shape[0]
            start_time = time()
            # Seed centroids with k-means++ on a (possibly subsampled) candidate pool.
            if self.minibatch * 10 // offset < batch_size:
                x = X[torch.randint(0, batch_size, [int(self.minibatch * 10 / offset)])].to(self.device)
            else:
                x = X.to(self.device)
            self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch / 12 / offset), batch_size))
            del x
            torch.cuda.empty_cache()
            num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)  # initialized to all ones
            closest = None
            if self.minibatch >= batch_size // 2 and self.minibatch < batch_size:
                # One fixed subsample that fits on the GPU.
                X = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
            elif self.minibatch >= batch_size:
                X = X.to(self.device)
            for i in range(self.max_iter):
                iter_time = time()
                if self.minibatch < batch_size // 2:
                    # The affordable minibatch is too small relative to the data, so a
                    # fresh sample is shuttled from host memory to VRAM every iteration.
                    x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
                else:
                    # Otherwise the (sub)sampled data is already cached on the GPU.
                    x = X
                # Index of the nearest centroid per sample (int16 suffices for <= 32767 clusters).
                closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)
                matched_clusters, counts = closest.unique(return_counts=True)
                # One-hot style mask: row c marks the samples assigned to cluster c.
                expanded_closest = closest[None].expand(self.n_clusters, -1)
                mask = (expanded_closest == torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)
                c_grad = mask @ x / mask.sum(-1)[..., :, None]
                c_grad[c_grad != c_grad] = 0  # remove NaNs from empty clusters
                error = (c_grad - self.centroids).pow(2).sum()
                if self.minibatch is not None:
                    # Per-cluster learning rate decays as a cluster accumulates points.
                    lr = 1 / num_points_in_clusters[:, None] * 0.9 + 0.1
                else:
                    lr = 1
                matched_clusters = matched_clusters.long()  # index tensors must be long, byte or bool
                num_points_in_clusters[matched_clusters] += counts
                self.centroids = self.centroids * (1 - lr) + c_grad * lr
                if self.verbose >= 2:
                    print('iter:', i, 'error:', error.item(), 'time spent:', round(time() - iter_time, 4))
                if error <= self.tol:
                    break
            if self.verbose >= 1:
                print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
            return closest
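
# Hedged usage sketch (shapes assumed, requires a CUDA device and pynvml;
# not prescribed by this file): cluster 200k 256-dim frames into 1000 groups.
#
#   feats = torch.randn(200_000, 256)  # [n_samples, n_features], float32
#   km = KMeansGPU(n_clusters=1000, max_iter=500, tol=1e-2, verbose=1)
#   labels = km.fit_predict(feats)     # int16 labels for the processed batch
#   centroids = km.centroids           # [1000, 256], on the GPU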