1
2 """
3 Created on Mon Mar 10 13:52:23 2014
4
5 Copyright (c) 2013-2014, CEA/DSV/I2BM/Neurospin. All rights reserved.
6
7 @author: Edouard Duchesnay
8 @email: edouard.duchesnay@cea.fr
9 @license: BSD 3-clause.
10 """
11 import numpy as np
12
13
def class_weight_to_sample_weight(class_weight, y):
    """Estimate per-sample weights from class weights (unbalanced datasets).

    Each sample of class ``k`` receives the weight
    ``n_samples / n_k * p_k`` where ``n_k`` is the number of samples of
    class ``k`` and ``p_k`` is the weight of that class.

    Parameters
    ----------
    class_weight : dict, 'auto' or None
        If 'auto', every class gets the same weight ``1 / n_classes``, so
        sample weights are inversely proportional to the frequency of the
        class in the data and sum to n_samples.
        If a dictionary is given, keys are classes and values are the
        corresponding class weights. With two classes in {0, 1},
        class_weight = {0: 0.5, 1: 0.5} is equivalent to
        class_weight == 'auto'.
        If None (or an empty container) is given, the weights are uniform:
        every sample gets the weight 1.

    y : array-like, shape (n_samples,)
        Array of original class labels per sample. Labels may be any
        sortable values; they do not need to be 0..n_classes-1.

    Returns
    -------
    sample_weight : ndarray of float, same shape as y
        Array with sample_weight[i] the weight for the i-th sample.

    Raises
    ------
    ValueError
        If class_weight is neither a dict, 'auto' nor None.
    KeyError
        If class_weight is a dict that lacks a label present in y.

    Examples
    --------
    >>> y = np.array([1, 1, 1, 0, 0, 2])
    >>> w = class_weight_to_sample_weight("auto", y)
    >>> bool(np.isclose(w.sum(), len(y)))
    True
    >>> ["%i:%.2f" % (l, np.sum(w[y == l])) for l in np.unique(y)]
    ['0:2.00', '1:2.00', '2:2.00']
    >>> cw = {0: 1. / 3, 1: 1. / 3, 2: 1. / 3}
    >>> w2 = class_weight_to_sample_weight(cw, y)
    >>> bool(np.allclose(w2, w))
    True
    """
    # Convert first so that lists are accepted on every code path.
    y = np.asarray(y)

    if class_weight is None or (hasattr(class_weight, "__len__")
                                and len(class_weight) == 0):
        return np.ones(y.shape, dtype=np.float64)

    is_dict = isinstance(class_weight, dict)
    if not is_dict and not class_weight == 'auto':
        # Wrap in a tuple so that tuple arguments do not break formatting.
        raise ValueError("class_weight must be dict, 'auto', or None,"
                         " got: %r" % (class_weight,))

    if y.size == 0:
        # No sample: nothing to weight (and avoids a division by zero).
        return np.ones(y.shape, dtype=np.float64)

    classes = np.unique(y)
    # Position of each sample's label within the sorted unique labels.
    class_idx = np.searchsorted(classes, y)
    # Count per class, aligned with `classes` (not with raw label values),
    # so arbitrary labels such as {-1, 1} or {1, 2} are handled correctly.
    nk = np.bincount(class_idx.ravel())
    n = float(y.shape[0])

    if is_dict:
        pk = np.array([class_weight[k] for k in classes])
    else:
        pk = 1. / classes.shape[0]

    wk = n / nk * pk
    return wk[class_idx]
68
69
def check_labels(y):
    """Ensure binary classification with labels coded as 0 and 1.

    The sorted unique labels found in ``y`` are mapped to 0, 1 in that
    order (a single-class input is therefore mapped to 0, unless it is
    already 0).

    Parameters
    ----------
    y : array-like
        Class labels per sample, with at most two distinct values.

    Returns
    -------
    y_recoded : array-like
        ``y`` itself, unchanged, when its labels are already 0..k-1;
        otherwise a new float64 ndarray of the same shape holding the
        recoded labels.

    Raises
    ------
    ValueError
        If ``y`` contains more than two distinct labels.
    """
    nlevels = 2
    # Work on an array view so that plain lists are supported as well.
    labels = np.asarray(y)
    classes = np.unique(labels)
    if len(classes) > nlevels:
        raise ValueError("Multinomial classification with more "
                         "than %i labels is not possible" % nlevels)
    classes_recoded = np.arange(len(classes))
    if np.all(classes_recoded == classes):
        # Already coded as expected: hand back the caller's object as is.
        return y

    y_recoded = np.zeros(labels.shape, dtype=np.float64)
    # enumerate works on both Python 2 and 3 (xrange is Python 2 only).
    for code, label in enumerate(classes):
        y_recoded[labels == label] = code
    return y_recoded
85