Package parsimony :: Package algorithms :: Module cluster
[hide private]
[frames] | no frames]

Source Code for Module parsimony.algorithms.cluster

  1  # -*- coding: utf-8 -*- 
  2  """ 
  3  The :mod:`parsimony.algorithms.cluster` module includes several algorithms 
  4  that performed clustering. 
  5   
  6  Algorithms may not store states. I.e., if they are classes, do not keep 
  7  references to objects with state in the algorithm objects. It should be 
  8  possible to copy and share algorithms between e.g. estimators, and thus they 
  9  should not depend on any state. 
 10   
 11  Created on Mon Feb  2 11:33:18 2015 
 12   
 13  Copyright (c) 2013-2014, CEA/DSV/I2BM/Neurospin. All rights reserved. 
 14   
 15  @author:  Tommy Löfstedt 
 16  @email:   lofstedt.tommy@gmail.com 
 17  @license: BSD 3-clause. 
 18  """ 
 19  import numpy as np 
 20   
 21  try: 
 22      from . import bases  # Only works when imported as a package. 
 23  except ValueError: 
 24      import parsimony.algorithms.bases as bases  # When run as a program. 
 25  import parsimony.utils.maths as maths 
 26  import parsimony.utils.consts as consts 
 27   
 28  __all__ = ["KMeans"] 
 29   
 30   
31 -class KMeans(bases.ImplicitAlgorithm):
32 """The K-means clustering algorithm. 33 34 Performs clustering using the K-means clustering algorithm, also known as 35 Lloyd's algorithm. 36 37 Parameters 38 ---------- 39 K : Positive integer. The number of clusters to find. 40 41 init : KMeans.INIT or numpy array with shape (K, p). The initialisation 42 method to use for the centres. If init is a numpy array, it 43 contains the actual centres to use. 44 45 repeat : Positive integer. Default is 10. The number of times to repeat the 46 initialisation and run the algorithm. The set of centre points that 47 give the lowest within-cluster sum of squares is returned. 48 49 max_iter : Non-negative integer. Maximum allowed number of iterations. 50 Default is 100. 51 52 eps : Positive float. The tolerance used by the stopping criterion. 53 54 Returns 55 ------- 56 centers : Numpy array. The mean cluster centres. 57 58 Examples 59 -------- 60 >>> import numpy as np 61 """
62 - class INIT(object):
63 random = "random" # Random assignment.
64
65 - def __init__(self, K, init=INIT.random, repeat=10, 66 max_iter=100, eps=consts.TOLERANCE):
67 self.K = max(1, int(K)) 68 self.init = init 69 self.repeat = max(1, int(repeat)) 70 self.max_iter = max(0, int(max_iter)) 71 self.eps = max(consts.TOLERANCE, float(eps))
72
73 - def run(self, X):
74 """Runs the K-means clustering algorithm on the given data matrix. 75 76 Parameters 77 ---------- 78 X : Numpy array of shape (n, p). The matrix of points to cluster. 79 """ 80 K = min(self.K, X.shape[0]) # If K > # points. 81 82 best_wcss = np.infty 83 best_mus = None 84 for repeat in xrange(self.repeat): 85 86 mus = self._init_mus(X, K) 87 88 for it in xrange(self.max_iter): 89 closest = self._closest_centers(X, mus, K) 90 old_mus = mus 91 mus = self._new_centers(X, closest, K) 92 93 if maths.norm(old_mus - mus) / maths.norm(old_mus) < self.eps: 94 break 95 96 if self.repeat == 1: 97 best_mus = mus 98 else: 99 wcss = self._wcss(X, mus, closest, K) 100 101 if wcss < best_wcss: 102 best_wcss = wcss 103 best_mus = mus 104 105 return best_mus
106
107 - def _init_mus(self, X, K):
108 if isinstance(self.init, np.ndarray): 109 mus = self.init 110 # TODO: Warn if repeat > 1? 111 elif self.init == KMeans.INIT.random: 112 xmax = np.max(X, axis=0) 113 xmin = np.min(X, axis=0) 114 diff = xmax - xmin 115 mus = np.zeros((K, X.shape[1])) 116 for i in xrange(K): 117 mu = np.multiply(diff, np.random.rand(X.shape[1])) + xmin 118 mus[i, :] = mu 119 else: 120 raise ValueError("'init' is not of a valid type.") 121 122 return mus
123
124 - def _closest_centers(self, X, mus, K):
125 """Given a set of points and a set of centres, compute the closest 126 centre to each point. 127 """ 128 dists = np.zeros((X.shape[0], K)) 129 i = 0 130 for mu in mus: 131 dist = np.sum((X - mu) ** 2.0, axis=1) 132 dists[:, i] = dist 133 i += 1 134 135 closest = np.argmin(dists, axis=1).tolist() 136 137 return closest
138
139 - def _new_centers(self, X, closest, K):
140 """Given a set of points, X, and their previously closest centre, 141 encoded in closest, computes their new centres. 142 """ 143 mus = np.zeros((K, X.shape[1])) 144 closest = np.array(closest) 145 global_mean = None 146 for i in xrange(K): 147 idx = closest == i 148 X_ = X[idx, :] 149 if X_.shape[0] == 0: # No point was closest to this centre! 150 if global_mean is None: 151 global_mean = np.mean(X, axis=0) 152 mu = global_mean # TODO: Correct solution? 153 else: 154 mu = np.mean(X_, axis=0) 155 mus[i, :] = mu 156 157 return mus
158
159 - def _wcss(self, X, mus, closest, K):
160 """Within-cluster sum of squares loss function. 161 """ 162 closest = np.array(closest) 163 wcss = 0.0 164 for i in xrange(K): 165 idx = closest == i 166 wcss += np.sum((X[idx, :] - mus[i, :]) ** 2.0) 167 168 return wcss
169 170 if __name__ == "__main__": 171 import doctest 172 doctest.testmod() 173