1
2 """
3 Created on Tue Jun 18 09:22:40 2013
4
5 Copyright (c) 2013-2014, CEA/DSV/I2BM/Neurospin. All rights reserved.
6
7 @author: Tommy Löfstedt
8 @email: tommy.loefstedt@cea.fr
9 @license: BSD 3-clause.
10 """
11 import random
12
13 import numpy as np
14
15 import correlation_matrices
16
17 __all__ = ['load']
18
19
20 -def load(size=[[100, 100]], rho=[0.05], delta=0.1, eps=None, density=0.5,
21 snr=100.0, locally_smooth=False):
22 """ Generates random data for regression purposes. Builds data with a
23 regression model on the form
24
25 y = X.beta + e.
26
27 Parameters
28 ----------
29 size : A list or a list of lists. The shapes of the block matrices to
30 generate. The numbers of rows must be the same.
31
32 rho : A scalar or a list of the average correlation between off-diagonal
33 elements of S.
34
35 delta : Baseline noise between groups. Only used if the number of groups is
36 greater than one and locally_smooth=False. The baseline noise is
37 computed as
38
39 delta * rho_min,
40
41 and you must prvide a delta such that 0 <= delta < 1.
42
43 eps : Maximum entry-wise random noise. This parameter determines the
44 distribution of the noise. The noise is approximately normally
45 distributed. If locally_smooth=False the mean is
46
47 delta * rho_min
48
49 and the variance is
50
51 (eps * (1 - max(rho))) ** 2.0 / 10.
52
53 If locally_smooth=True, the mean is zero and the variance is
54
55 (eps * (1.0 - max(rho)) / (1.0 + max(rho))) ** 2.0 / 10.
56
57 You can thus control the noise by this parameter, but note that you
58 must have
59
60 0 <= eps < 1.
61
62 density : Determines how much of the regression vector is set to zero. If
63 density=1.0, the regression vector is dense and if density=0.0
64 would mean a zero vector. However, note that you should let
65
66 density * p >= 1,
67
68 where p is the number of columns in size.
69
70 snr : The signal-to-noise ratio. The dependent variable is computed as
71
72 y = X.beta + e
73
74 and Var(e) = (||X.beta||² / (n - 1)) / snr.
75
76 locally_smooth : If True, uses ToeplitzCorrelation (with "local
77 smoothing"); if False, uses ConstantCorrelation.
78
79 Returns
80 -------
81 X : The matrix of independent variables.
82
83 y : The dependent variable.
84
85 beta : The regression vector.
86
87 e : The noise/residual vector.
88 """
89 if not isinstance(rho, (list, tuple)):
90 size = [size]
91 rho = [rho]
92
93 K = len(rho)
94
95 p = [0] * K
96 n = None
97 for k in xrange(K):
98 if n != None and size[k][0] != n:
99 raise ValueError("The groups must have the same number of samples")
100 n = size[k][0]
101 pk = size[k][1]
102 p[k] = pk
103
104 if eps == None:
105 eps = np.sqrt(10.0 / float(n))
106
107 if locally_smooth:
108 S = correlation_matrices.ToeplitzCorrelation(p, rho, eps)
109 else:
110 S = correlation_matrices.ConstantCorrelation(p, rho, delta, eps)
111
112 p = sum(p)
113
114
115 mean = np.zeros(p)
116 X = np.random.multivariate_normal(mean, S, n)
117
118
119 beta = (np.random.rand(p, 1) - 0.5) * 2.0
120 ind = range(p)
121 random.shuffle(ind)
122 ind = ind[:int(round(len(ind) * (1.0 - density)))]
123 beta[ind] = 0
124
125
126 y = np.dot(X, beta)
127
128
129 var = (np.sum(y ** 2.0) / float(n - 1)) / float(snr)
130 e = np.random.randn(n, 1)
131 e *= np.sqrt(var)
132 y += e
133
134 return X, y, beta, e
135
136
137 if __name__ == "__main__":
138
139 n = 100
140 p = 100
141
142
143
144
145 X, S = load()
146 XX = np.dot(X.T, X) / (float(n) - 1.0)
147