Merge pull request #181 from RVC-Boss/4.0-Vec768-Layer12
updated_kmeans_pytorch_gpu_version
commit ea68821ac1
@@ -0,0 +1,55 @@
import time, pdb
import tqdm
from time import time as ttime
import os
from pathlib import Path
import logging
import argparse
from cluster.kmeans import KMeansGPU
import torch
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans  # MiniBatchKMeans is used below but was never imported

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import pynvml

def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):  # GPU minibatch performs really poorly; the library supports it, but it is not considered here either
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        # for name in os.listdir(in_dir):
        #     path = "%s/%s" % (in_dir, name)
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # count loaded feature files; the original never incremented this
        # print(features[-1].shape)
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if not use_gpu:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)  # .to(device)
        labels = kmeans.fit_predict(features)

    print(time.time() - t, "s")

    x = {
        "n_features_in_": kmeans.n_features_in_ if not use_gpu else features.shape[1],  # feature dimension; the original's shape[0] is the sample count, which looks like a bug
        "_n_threads": kmeans._n_threads if not use_gpu else 4,
        "cluster_centers_": kmeans.cluster_centers_ if not use_gpu else kmeans.centroids.cpu().numpy(),
    }
    print("end")

    return x

if __name__ == "__main__":
    res = train_cluster(Path("/data/docker/dataset/12b-co256tensor"), 1000, use_minibatch=False, verbose=False, use_gpu=True)  # in_dir must be a Path, since train_cluster calls .glob on it
    pdb.set_trace()
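For reference, a minimal sketch of driving this entry point and persisting its result; the feature directory and output path below are hypothetical, not part of this commit:

from pathlib import Path
import torch

ckpt = train_cluster(Path("dataset/features"), n_clusters=1000,  # hypothetical feature dir
                     use_minibatch=False, verbose=True, use_gpu=True)
# The returned dict mirrors the sklearn KMeans attributes
# (n_features_in_, _n_threads, cluster_centers_), so it can be saved
# and later consumed by sklearn-style code.
torch.save(ckpt, "logs/kmeans_1000.pt")  # hypothetical output path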
@@ -0,0 +1,201 @@
import math, pdb
import torch, pynvml
from torch.nn.functional import normalize
from time import time
import numpy as np

# device = torch.device("cuda:0")

def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
    """Picks k points in the data based on the kmeans++ method.

    Parameters
    ----------
    data : torch.Tensor
        Expects a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    sample_size : int
        Subsample the data to this many rows to avoid memory overflow
        during the distance calculation.

    Returns
    -------
    init : ndarray
        A 'k' by 'N' array containing the initial centroids.

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM
       Symposium on Discrete Algorithms, 2007.
    .. [2] scipy/cluster/vq.py: _kpp
    """
    batch_size = data.shape[0]
    if sample_size > 0 and batch_size > sample_size:  # guard added: the default sample_size=-1 would crash torch.randint
        data = data[torch.randint(0, batch_size, [sample_size], device=data.device)]
    dims = data.shape[1] if len(data.shape) > 1 else 1
    init = torch.zeros((k, dims)).to(data.device)
    r = torch.distributions.uniform.Uniform(0, 1)
    for i in range(k):
        if i == 0:
            # first centroid: a uniformly random data point
            init[i, :] = data[torch.randint(data.shape[0], [1])]
        else:
            # remaining centroids: sample proportionally to each point's
            # distance to its nearest already-chosen centroid
            D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
            probs = D2 / torch.sum(D2)
            cumprobs = torch.cumsum(probs, dim=0)
            init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
    return init
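A quick self-contained check of the seeding routine above (toy CPU data, purely illustrative; shapes chosen arbitrarily):

data = torch.randn(5000, 8)
seeds = _kpp(data, k=16, sample_size=2000)  # subsample 2000 rows to bound the cdist cost
assert seeds.shape == (16, 8)               # k centroids with the data's dimensionality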
class KMeansGPU:
    '''
    Kmeans clustering algorithm implemented with PyTorch

    Parameters:
      n_clusters: int,
        Number of clusters

      max_iter: int, default: 200
        Maximum number of iterations

      tol: float, default: 0.0001
        Tolerance

      verbose: int, default: 0
        Verbosity

      mode: {'euclidean', 'cosine'}, default: 'euclidean'
        Type of distance measure

      device: torch.device, default: torch.device("cuda:0")
        Device the clustering runs on

    Attributes:
      centroids: torch.Tensor, shape: [n_clusters, n_features]
        cluster centroids

      minibatch: int
        Batch size of the minibatch KMeans updates; computed in __init__
        from the free GPU memory
    '''
    def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean", device=torch.device("cuda:0")):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.mode = mode
        self.device = device
        # minibatch = 33e6 / n_clusters rows per GiB of free GPU memory
        pynvml.nvmlInit()
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
        info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
        self.minibatch = int(33e6 / self.n_clusters * info.free / 1024 / 1024 / 1024)
        print("free_mem/GB:", info.free / 1024 / 1024 / 1024, "minibatch:", self.minibatch)

    @staticmethod
    def cos_sim(a, b):
        """
        Compute cosine similarity of 2 sets of vectors

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)

    @staticmethod
    def euc_sim(a, b):
        """
        Compute euclidean similarity of 2 sets of vectors
        (2a.b - |a|^2 - |b|^2 = -|a - b|^2, so larger means closer)

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        return 2 * a @ b.transpose(-2, -1) - (a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]

    def max_sim(self, a, b):
        """
        Compute maximum similarity (or minimum distance) of each vector
        in a with all of the vectors in b

        Parameters:
          a: torch.Tensor, shape: [m, n_features]
          b: torch.Tensor, shape: [n, n_features]
        """
        if self.mode == 'cosine':
            sim_func = self.cos_sim
        elif self.mode == 'euclidean':
            sim_func = self.euc_sim
        sim = sim_func(a, b)
        max_sim_v, max_sim_i = sim.max(dim=-1)
        return max_sim_v, max_sim_i

    def fit_predict(self, X):
        """
        Combination of fit() and predict() methods.
        This is faster than calling fit() and predict() separately.

        Parameters:
          X: torch.Tensor, shape: [n_samples, n_features]

        Return:
          labels: torch.Tensor, shape: [n_samples]

        Sizing notes (k = n_clusters, remain = free GPU memory in GiB,
        fea_shape = n_samples):
          mini_ = 33e6/k * remain
          mini = min(mini_, fea_shape)
          offset = log2(k/1000) * 1.5
          kpp_all = min(mini_*10/offset, fea_shape)
          kpp_sample = min(mini_/12/offset, fea_shape)
        """
        assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
        assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
        assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features]"
        # print("verbose:%s" % self.verbose)

        offset = np.power(1.5, np.log(self.n_clusters / 1000)) / np.log(2)
        with torch.no_grad():
            batch_size = X.shape[0]
            # print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
            start_time = time()
            # kmeans++ seeding on a (possibly subsampled) slice of X
            if self.minibatch * 10 // offset < batch_size:
                x = X[torch.randint(0, batch_size, [int(self.minibatch * 10 / offset)])].to(self.device)
            else:
                x = X.to(self.device)
            # print(x.device)
            self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch / 12 / offset), batch_size))
            del x
            torch.cuda.empty_cache()
            # self.centroids = self.centroids.to(self.device)
            num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)  # all ones
            closest = None  # e.g. shape [3098036], int64
            if self.minibatch >= batch_size // 2 and self.minibatch < batch_size:
                X = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
            elif self.minibatch >= batch_size:
                X = X.to(self.device)
            for i in range(self.max_iter):
                iter_time = time()
                if self.minibatch < batch_size // 2:  # the usable minibatch is too small: every iteration has to shuttle a fresh sample from RAM to VRAM
                    x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
                else:  # otherwise just keep everything cached on the GPU
                    x = X

                closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)  # e.g. [3098036], values 0~999
                matched_clusters, counts = closest.unique(return_counts=True)  # int64, up to 1k entries
                expanded_closest = closest[None].expand(self.n_clusters, -1)  # e.g. [1000, 3098036], int16, 0~999
                mask = (expanded_closest == torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)  # the arange on the right of == is int64, length n_clusters
                c_grad = mask @ x / mask.sum(-1)[..., :, None]
                c_grad[c_grad != c_grad] = 0  # remove NaNs (clusters that received no points)
                error = (c_grad - self.centroids).pow(2).sum()
                if self.minibatch is not None:
                    lr = 1 / num_points_in_clusters[:, None] * 0.9 + 0.1  # per-cluster learning rate decays as a cluster accumulates points
                else:
                    lr = 1
                matched_clusters = matched_clusters.long()  # cast first, else "IndexError: tensors used as indices must be long, byte or bool tensors"
                num_points_in_clusters[matched_clusters] += counts
                self.centroids = self.centroids * (1 - lr) + c_grad * lr
                if self.verbose >= 2:
                    print('iter:', i, 'error:', error.item(), 'time spent:', round(time() - iter_time, 4))
                if error <= self.tol:
                    break

            if self.verbose >= 1:
                print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
        return closest  # labels of the last processed batch (the full dataset when it fit on the GPU)
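A minimal usage sketch of KMeansGPU on its own, assuming a CUDA device is available; the data is random and purely illustrative. Note the constructor sizes its minibatch as int(33e6 / n_clusters * free_GiB), so 1000 clusters on a card with 10 GiB free gives a 330,000-row minibatch:

import torch
from cluster.kmeans import KMeansGPU

feats = torch.randn(100_000, 256)                      # stays on CPU; fit_predict moves (mini)batches to the GPU
kmeans = KMeansGPU(n_clusters=1000, mode="euclidean",  # same settings train_cluster passes
                   verbose=2, max_iter=500, tol=1e-2)
labels = kmeans.fit_predict(feats)                     # int16 cluster id per row of the last batch
centers = kmeans.centroids.cpu().numpy()               # [1000, 256] centroid matrix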