1
2 """
3 The :mod:`parsimony.algorithms.cluster` module includes several algorithms
4 that performed clustering.
5
6 Algorithms may not store states. I.e., if they are classes, do not keep
7 references to objects with state in the algorithm objects. It should be
8 possible to copy and share algorithms between e.g. estimators, and thus they
9 should not depend on any state.
10
11 Created on Mon Feb 2 11:33:18 2015
12
13 Copyright (c) 2013-2014, CEA/DSV/I2BM/Neurospin. All rights reserved.
14
15 @author: Tommy Löfstedt
16 @email: lofstedt.tommy@gmail.com
17 @license: BSD 3-clause.
18 """
19 import numpy as np
20
21 try:
22 from . import bases
23 except ValueError:
24 import parsimony.algorithms.bases as bases
25 import parsimony.utils.maths as maths
26 import parsimony.utils.consts as consts
27
28 __all__ = ["KMeans"]
29
30
31 -class KMeans(bases.ImplicitAlgorithm):
32 """The K-means clustering algorithm.
33
34 Performs clustering using the K-means clustering algorithm, also known as
35 Lloyd's algorithm.
36
37 Parameters
38 ----------
39 K : Positive integer. The number of clusters to find.
40
41 init : KMeans.INIT or numpy array with shape (K, p). The initialisation
42 method to use for the centres. If init is a numpy array, it
43 contains the actual centres to use.
44
45 repeat : Positive integer. Default is 10. The number of times to repeat the
46 initialisation and run the algorithm. The set of centre points that
47 give the lowest within-cluster sum of squares is returned.
48
49 max_iter : Non-negative integer. Maximum allowed number of iterations.
50 Default is 100.
51
52 eps : Positive float. The tolerance used by the stopping criterion.
53
54 Returns
55 -------
56 centers : Numpy array. The mean cluster centres.
57
58 Examples
59 --------
60 >>> import numpy as np
61 """
64
67 self.K = max(1, int(K))
68 self.init = init
69 self.repeat = max(1, int(repeat))
70 self.max_iter = max(0, int(max_iter))
71 self.eps = max(consts.TOLERANCE, float(eps))
72
74 """Runs the K-means clustering algorithm on the given data matrix.
75
76 Parameters
77 ----------
78 X : Numpy array of shape (n, p). The matrix of points to cluster.
79 """
80 K = min(self.K, X.shape[0])
81
82 best_wcss = np.infty
83 best_mus = None
84 for repeat in xrange(self.repeat):
85
86 mus = self._init_mus(X, K)
87
88 for it in xrange(self.max_iter):
89 closest = self._closest_centers(X, mus, K)
90 old_mus = mus
91 mus = self._new_centers(X, closest, K)
92
93 if maths.norm(old_mus - mus) / maths.norm(old_mus) < self.eps:
94 break
95
96 if self.repeat == 1:
97 best_mus = mus
98 else:
99 wcss = self._wcss(X, mus, closest, K)
100
101 if wcss < best_wcss:
102 best_wcss = wcss
103 best_mus = mus
104
105 return best_mus
106
108 if isinstance(self.init, np.ndarray):
109 mus = self.init
110
111 elif self.init == KMeans.INIT.random:
112 xmax = np.max(X, axis=0)
113 xmin = np.min(X, axis=0)
114 diff = xmax - xmin
115 mus = np.zeros((K, X.shape[1]))
116 for i in xrange(K):
117 mu = np.multiply(diff, np.random.rand(X.shape[1])) + xmin
118 mus[i, :] = mu
119 else:
120 raise ValueError("'init' is not of a valid type.")
121
122 return mus
123
125 """Given a set of points and a set of centres, compute the closest
126 centre to each point.
127 """
128 dists = np.zeros((X.shape[0], K))
129 i = 0
130 for mu in mus:
131 dist = np.sum((X - mu) ** 2.0, axis=1)
132 dists[:, i] = dist
133 i += 1
134
135 closest = np.argmin(dists, axis=1).tolist()
136
137 return closest
138
140 """Given a set of points, X, and their previously closest centre,
141 encoded in closest, computes their new centres.
142 """
143 mus = np.zeros((K, X.shape[1]))
144 closest = np.array(closest)
145 global_mean = None
146 for i in xrange(K):
147 idx = closest == i
148 X_ = X[idx, :]
149 if X_.shape[0] == 0:
150 if global_mean is None:
151 global_mean = np.mean(X, axis=0)
152 mu = global_mean
153 else:
154 mu = np.mean(X_, axis=0)
155 mus[i, :] = mu
156
157 return mus
158
159 - def _wcss(self, X, mus, closest, K):
160 """Within-cluster sum of squares loss function.
161 """
162 closest = np.array(closest)
163 wcss = 0.0
164 for i in xrange(K):
165 idx = closest == i
166 wcss += np.sum((X[idx, :] - mus[i, :]) ** 2.0)
167
168 return wcss
169
170 if __name__ == "__main__":
171 import doctest
172 doctest.testmod()
173