
Source Code for Module parsimony.functions.combinedfunctions

   1  # -*- coding: utf-8 -*- 
   2  """ 
   3  The :mod:`parsimony.functions.combinedfunctions` module contains ready-made 
   4  common combinations of loss functions and penalties that can be used right 
   5  away to analyse real data. 
   6   
   7  Created on Mon Apr 22 10:54:29 2013 
   8   
   9  Copyright (c) 2013-2014, CEA/DSV/I2BM/Neurospin. All rights reserved. 
  10   
  11  @author:  Tommy Löfstedt, Vincent Guillemot, Edouard Duchesnay and 
  12            Fouad Hadj-Selem 
  13  @email:   lofstedt.tommy@gmail.com, edouard.duchesnay@cea.fr 
  14  @license: BSD 3-clause. 
  15  """ 
  16  import numpy as np 
  17   
  18  from . import properties 
  19  #import nesterov.properties as nesterov_properties 
  20  #from .nesterov.l1 import L1 as SmoothedL1 
  21  #import nesterov 
  22  from .nesterov.l1tv import L1TV 
  23  from .nesterov.tv import TotalVariation 
  24  from .nesterov.gl import GroupLassoOverlap 
  25  from .penalties import ZeroFunction, L1, LinearVariableConstraint 
  26  from .penalties import RidgeSquaredError 
  27  from .losses import RidgeRegression 
  28  from .losses import RidgeLogisticRegression 
  29  from .losses import LatentVariableVariance 
  30  import parsimony.utils.linalgs as linalgs 
  31  import parsimony.utils.consts as consts 
  32  import parsimony.utils.maths as maths 
  33  from parsimony.utils import deprecated 
  34   
  35  __all__ = ["CombinedFunction", 
  36             "LinearRegressionL1L2TV", "LinearRegressionL1L2GL", 
  37             "LogisticRegressionL1L2TV", "LogisticRegressionL1L2GL", 
  38             "LinearRegressionL2SmoothedL1TV", 
  39             "AugmentedLinearRegressionL1L2TV", 
  40             "PrincipalComponentAnalysisL1TV"] 
  41 
  42  # TODO: Add penalty_start and mean to all of these! 
  43 
  44 
  45  class CombinedFunction(properties.CompositeFunction, 
  46                         properties.Gradient, 
  47                         properties.ProximalOperator, 
  48                         properties.ProjectionOperator, 
  49                         properties.StepSize): 
50 """Combines one or more loss functions, any number of penalties and zero 51 or one proximal operator. 52 53 This function thus represents 54 55 f(x) = f_1(x) [ + f_2(x) ... ] [ + p_1(x) ... ] [ + P(x)], 56 57 subject to [ C_1(x) <= c_1, 58 C_2(x) <= c_2, 59 ... ], 60 61 where f_i are differentiable Functions, p_j are differentiable penalties 62 and P is a ProximalOperator. All functions and penalties must thus be 63 Gradient, unless it is a ProximalOperator. 64 65 If no ProximalOperator is given, then prox is the identity. 66 """
67 - def __init__(self, functions=[], penalties=[], prox=[], constraints=[]):
68 69 self._f = list(functions) 70 self._p = list(penalties) 71 self._prox = list(prox) 72 if len(self._prox) == 0: 73 self._prox.append(ZeroFunction()) 74 self._c = list(constraints)
75
76 - def reset(self):
77 78 for f in self._f: 79 f.reset() 80 81 for p in self._p: 82 p.reset() 83 84 for prox in self._prox: 85 prox.reset() 86 87 for c in self._c: 88 c.reset()
89
90 - def add_loss(self, function):
91 92 if not isinstance(function, properties.Gradient): 93 raise ValueError("Loss functions must have gradients.") 94 95 self._f.append(function)
96 97 @deprecated("add_loss")
98 - def add_function(self, function):
99 100 return self.add_loss(function)
101
102 - def add_penalty(self, penalty):
103 104 if not isinstance(penalty, properties.Penalty): 105 raise ValueError("Not a penalty.") 106 elif isinstance(penalty, properties.Gradient): 107 self._p.append(penalty) 108 elif isinstance(penalty, properties.ProximalOperator): 109 if len(self._c) > 0: 110 # TODO: Yes we can! Fix this! 111 raise ValueError("Cannot have both ProximalOperator and " 112 "ProjectionOperator.") 113 else: 114 # TODO: We currently only allow one proximal operator. Fix! 115 self._prox[0] = penalty 116 else: 117 raise ValueError("The penalty is not smooth, nor has it a " 118 "proximal operator.")
119
120 - def add_constraint(self, constraint):
121 122 if not isinstance(constraint, properties.Constraint): 123 raise ValueError("Not a constraint.") 124 elif not isinstance(constraint, properties.ProjectionOperator): 125 raise ValueError("Constraints must have projection operators.") 126 elif not isinstance(self._prox[0], ZeroFunction): 127 # TODO: Yes we can! Fix this! 128 raise ValueError("Cannot have both ProjectionOperator and " 129 "ProximalOperator.") 130 else: 131 self._c.append(constraint)
132
133 - def add_smooth_penalty(self, penalty):
134 135 if not isinstance(penalty, properties.Penalty): 136 raise ValueError("Not a penalty.") 137 elif not isinstance(penalty, properties.Gradient): 138 raise ValueError("The penalty is not smooth, nor has it a " 139 "proximal operator.") 140 else: 141 self._p.append(penalty)
142
143 - def add_prox(self, penalty):
144 145 if not isinstance(penalty, properties.ProximalOperator): 146 raise ValueError("Not a proximal operator.") 147 elif len(self._c) > 0: 148 # TODO: Yes we can! Fix this! 149 raise ValueError("Cannot have both ProximalOperator and " 150 "ProjectionOperator.") 151 else: 152 # TODO: We currently only allow one proximal operator. Fix this! 153 self._prox[0] = penalty
154
155 - def add_nesterov(self, penalty):
156 157 if not isinstance(penalty, properties.NesterovFunction): 158 raise ValueError("Not a Nesterov function.") 159 else: 160 self._p.append(penalty)
161
162 - def f(self, x):
163 """Function value. 164 """ 165 val = 0.0 166 for f in self._f: 167 val += f.f(x) 168 169 for p in self._p: 170 val += p.f(x) 171 172 for prox in self._prox: 173 val += prox.f(x) 174 175 return val
176
177 - def grad(self, x):
178 """Gradient of the differentiable part of the function. 179 180 From the interface "Gradient". 181 """ 182 grad = 0.0 183 184 # Add gradients from the loss functions. 185 for f in self._f: 186 grad += f.grad(x) 187 188 # Add gradients from the penalties. 189 for p in self._p: 190 grad += p.grad(x) 191 192 return grad
193
194 - def prox(self, x, factor=1.0, **kwargs):
195 """The proximal operator of the non-differentiable part of the 196 function. 197 198 From the interface "ProximalOperator". 199 """ 200 # TODO: We currently only allow one proximal operator. Fix this! 201 return self._prox[0].prox(x, factor=factor, **kwargs)
202
203 - def proj(self, x):
204 raise NotImplementedError("Not yet implemented.")
205
206 - def step(self, x):
207 """The step size to use in descent methods. 208 209 From the interface "StepSize". 210 211 Parameters 212 ---------- 213 x : Numpy array. The point at which to evaluate the step size. 214 """ 215 all_lipschitz = True 216 for f in self._f: 217 if not isinstance(f, properties.LipschitzContinuousGradient): 218 all_lipschitz = False 219 break 220 221 for p in self._p: 222 if not isinstance(p, properties.LipschitzContinuousGradient): 223 all_lipschitz = False 224 break 225 226 step = 0.0 227 if all_lipschitz: 228 L = 0.0 229 for f in self._f: 230 L += f.L() 231 for p in self._p: 232 L += p.L() 233 234 if all_lipschitz and L > consts.TOLERANCE: 235 step = 1.0 / L 236 else: 237 # If not all functions have Lipschitz continuous gradients, try 238 # to find the step size through backtracking line search. 239 from parsimony.algorithms.utils import BacktrackingLineSearch 240 import parsimony.functions.penalties as penalties 241 242 p = -self.grad(x) 243 line_search = BacktrackingLineSearch( 244 condition=penalties.SufficientDescentCondition, max_iter=30) 245 step = line_search.run(self, x, p, rho=0.5, a=0.1, 246 condition_params=dict(c=1e-4)) 247 248 return step
249
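A minimal usage sketch of CombinedFunction (not part of the module source; the data, the seed and the regularisation constants 0.1 and 0.05 are placeholders). It pairs a RidgeRegression loss with an L1 proximal operator, following the way those constructors are called elsewhere in this module, and takes one proximal gradient step:

    import numpy as np

    import parsimony.functions.losses as losses
    import parsimony.functions.penalties as penalties
    from parsimony.functions import CombinedFunction

    np.random.seed(42)
    X = np.random.randn(50, 10)
    y = np.random.randn(50, 1)

    # Smooth ridge loss plus a non-smooth L1 proximal operator.
    function = CombinedFunction()
    function.add_loss(losses.RidgeRegression(X, y, 0.1, mean=True))
    function.add_prox(penalties.L1(0.05))

    beta = np.random.randn(10, 1)
    val = function.f(beta)        # loss + penalties + prox term
    grad = function.grad(beta)    # gradient of the differentiable part only
    step = function.step(beta)    # 1 / L when all parts are Lipschitz
    beta = function.prox(beta - step * grad, factor=step)
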
 250 
 251  class LinearRegressionL1L2TV(properties.CompositeFunction, 
 252                               properties.NesterovFunction, 
 253                               properties.ProximalOperator, 
 254                               properties.Continuation, 
 255                               properties.DualFunction, 
 256                               properties.StronglyConvex, 
 257                               properties.StepSize): 
258 """Combination (sum) of LinearRegression, L1, L2 and TotalVariation. 259 260 Parameters: 261 ---------- 262 X : Numpy array. The X matrix for the linear regression. 263 264 y : Numpy array. The y vector for the linear regression. 265 266 l1 : Non-negative float. The Lagrange multiplier, or regularisation 267 constant, for the L1 penalty. 268 269 l2 : Non-negative float. The Lagrange multiplier, or regularisation 270 constant, for the ridge penalty. 271 272 tv : Non-negative float. The Lagrange multiplier, or regularisation 273 constant, of the smoothed TV function. 274 275 A : Numpy array (usually sparse). The linear operator for the Nesterov 276 formulation of TV. May not be None! 277 278 mu : Non-negative float. The regularisation constant for the smoothing 279 of the TV function. 280 281 penalty_start : Non-negative integer. The number of columns, variables 282 etc., to except from penalisation. Equivalently, the first 283 index to be penalised. Default is 0, all columns are included. 284 285 mean : Boolean. Whether to compute the squared loss or the mean 286 squared loss. Default is True, the mean squared loss. 287 """
288 - def __init__(self, X, y, l1, l2, tv, A=None, mu=0.0, penalty_start=0, 289 mean=True):
290 self.X = X 291 self.y = y 292 293 self.rr = RidgeRegression(X, y, l2, penalty_start=penalty_start, 294 mean=mean) 295 self.l1 = L1(l1, penalty_start=penalty_start) 296 self.tv = TotalVariation(tv, A=A, mu=mu, penalty_start=penalty_start) 297 298 self.penalty_start = penalty_start 299 self.mean = mean 300 301 self.reset()
302
303 - def reset(self):
304 305 self.rr.reset() 306 self.l1.reset() 307 self.tv.reset() 308 309 self._Xty = None 310 self._invXXkI = None 311 self._XtinvXXtkI = None
312
313 - def set_params(self, **kwargs):
314 315 mu = kwargs.pop("mu", self.get_mu()) 316 self.set_mu(mu) 317 318 super(LinearRegressionL1L2TV, self).set_params(**kwargs)
319
320 - def get_mu(self):
321 """Returns the regularisation constant for the smoothing. 322 323 From the interface "NesterovFunction". 324 """ 325 return self.tv.get_mu()
326
327 - def set_mu(self, mu):
328 """Sets the regularisation constant for the smoothing. 329 330 From the interface "NesterovFunction". 331 332 Parameters: 333 ---------- 334 mu : Non-negative float. The regularisation constant for the smoothing 335 to use from now on. 336 337 Returns: 338 ------- 339 old_mu : Non-negative float. The old regularisation constant for the 340 smoothing that was overwritten and is no longer used. 341 """ 342 return self.tv.set_mu(mu)
343
344 - def f(self, beta):
345 """Function value. 346 """ 347 return self.rr.f(beta) \ 348 + self.l1.f(beta) \ 349 + self.tv.f(beta)
350
351 - def fmu(self, beta, mu=None):
352 """Function value. 353 """ 354 return self.rr.f(beta) \ 355 + self.l1.f(beta) \ 356 + self.tv.fmu(beta, mu)
357
358 - def phi(self, alpha, beta):
359 """ Function value with known alpha. 360 """ 361 return self.rr.f(beta) \ 362 + self.l1.f(beta) \ 363 + self.tv.phi(alpha, beta)
364
365 - def grad(self, beta):
366 """Gradient of the differentiable part of the function. 367 368 From the interface "Gradient". 369 """ 370 return self.rr.grad(beta) \ 371 + self.tv.grad(beta)
372
373 - def L(self):
374 """Lipschitz constant of the gradient. 375 376 From the interface "LipschitzContinuousGradient". 377 """ 378 return self.rr.L() \ 379 + self.tv.L()
380
381 - def prox(self, beta, factor=1.0, **kwargs):
382 """The proximal operator of the non-differentiable part of the 383 function. 384 385 From the interface "ProximalOperator". 386 """ 387 return self.l1.prox(beta, factor)
388
389 - def estimate_mu(self, beta):
390 """Computes a "good" value of mu with respect to the given beta. 391 392 From the interface "NesterovFunction". 393 """ 394 return self.tv.estimate_mu(beta)
395
396 - def M(self):
397 """The maximum value of the regularisation of the dual variable. We 398 have 399 400 M = max_{alpha in K} 0.5*|alpha|²_2. 401 402 From the interface "NesterovFunction". 403 """ 404 return self.tv.M()
405
406 - def mu_opt(self, eps):
407 """The optimal value of mu given epsilon. 408 409 From the interface "Continuation". 410 """ 411 gM = self.tv.l * self.tv.M() 412 413 # Mu is set to 1.0, because it is in fact not here "anymore". It is 414 # factored out in this solution. 415 old_mu = self.tv.set_mu(1.0) 416 gA2 = self.tv.L() # Gamma is in here! 417 self.tv.set_mu(old_mu) 418 419 Lg = self.rr.L() 420 421 return (-gM * gA2 + np.sqrt((gM * gA2) ** 2.0 422 + gM * Lg * gA2 * eps)) \ 423 / (gM * Lg)
424
425 - def eps_opt(self, mu):
426 """The optimal value of epsilon given mu. 427 428 From the interface "Continuation". 429 """ 430 gM = self.tv.l * self.tv.M() 431 432 # Mu is set to 1.0, because it is in fact not here "anymore". It is 433 # factored out in this solution. 434 old_mu = self.tv.set_mu(1.0) 435 gA2 = self.tv.L() # Gamma is in here! 436 self.tv.set_mu(old_mu) 437 438 Lg = self.rr.L() 439 440 return (2.0 * gM * gA2 * mu 441 + gM * Lg * mu ** 2.0) \ 442 / gA2
443
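The two continuation formulas above, mu_opt and eps_opt, are mutual inverses, which is what continuation relies on when it alternates between them. Writing gamma_M = tv.l * tv.M(), gamma_A for the value of tv.L() at mu = 1 and L_g = rr.L(), a short check (reference derivation, not part of the module source):

    \mu_{\mathrm{opt}}(\varepsilon)
        = \frac{-\gamma_M\gamma_A
                + \sqrt{(\gamma_M\gamma_A)^2 + \gamma_M L_g \gamma_A \varepsilon}}
               {\gamma_M L_g},
    \qquad
    \varepsilon_{\mathrm{opt}}(\mu)
        = \frac{2\gamma_M\gamma_A\mu + \gamma_M L_g \mu^2}{\gamma_A}.

Substituting the second expression into the first,

    (\gamma_M\gamma_A)^2 + \gamma_M L_g \gamma_A\,\varepsilon_{\mathrm{opt}}(\mu)
        = \gamma_M^2\bigl(\gamma_A + L_g\mu\bigr)^2
    \quad\Longrightarrow\quad
    \mu_{\mathrm{opt}}\bigl(\varepsilon_{\mathrm{opt}}(\mu)\bigr)
        = \frac{\gamma_M(\gamma_A + L_g\mu) - \gamma_M\gamma_A}{\gamma_M L_g}
        = \mu.
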
444 - def eps_max(self, mu):
445 """The maximum value of epsilon. 446 447 From the interface "Continuation". 448 449 Parameters 450 ---------- 451 mu : Positive float. The regularisation constant of the smoothing. 452 453 Returns 454 ------- 455 eps : Positive float. The upper limit, the maximum, precision. 456 """ 457 gM = self.tv.l * self.tv.M() 458 459 return float(mu) * gM
460
461 - def mu_max(self, eps):
462 """The maximum value of mu. 463 464 From the interface "Continuation". 465 466 Parameters 467 ---------- 468 eps : Positive float. The maximum precision of the smoothing. 469 470 Returns 471 ------- 472 mu : Positive float. The upper limit, the maximum, of the 473 regularisation constant of the smoothing. 474 """ 475 gM = self.tv.l * self.tv.M() 476 477 return float(eps) / gM
478
479 - def betahat(self, alphak, betak, # mu_min=consts.TOLERANCE, 480 eps=consts.TOLERANCE, max_iter=consts.MAX_ITER):
481 """ Returns the beta that minimises the dual function. Used when we 482 compute the gap. 483 484 From the interface "DualFunction". 485 """ 486 # if self._Xty is None: 487 # self._Xty = np.dot(self.X.T, self.y) 488 489 Ata_tv = self.tv.l * self.tv.Aa(alphak) 490 if self.penalty_start > 0: 491 Ata_tv = np.vstack((np.zeros((self.penalty_start, 1)), 492 Ata_tv)) 493 494 # Ata_l1 = self.l1.l * SmoothedL1.project([betak / mu_min])[0] 495 # v = (self._Xty - Ata_tv - Ata_l1) 496 # 497 # shape = self.X.shape 498 # 499 # if shape[0] > shape[1]: # If n > p 500 # 501 # # Ridge solution 502 # if self._invXXkI is None: 503 # XtXkI = np.dot(self.X.T, self.X) 504 # index = np.arange(min(XtXkI.shape)) 505 # XtXkI[index, index] += self.rr.k 506 # self._invXXkI = np.linalg.inv(XtXkI) 507 # 508 # beta_hat = np.dot(self._invXXkI, v) 509 # 510 # else: # If p > n 511 # # Ridge solution using the Woodbury matrix identity: 512 # if self._XtinvXXtkI is None: 513 # XXtkI = np.dot(self.X, self.X.T) 514 # index = np.arange(min(XXtkI.shape)) 515 # XXtkI[index, index] += self.rr.k 516 # invXXtkI = np.linalg.inv(XXtkI) 517 # self._XtinvXXtkI = np.dot(self.X.T, invXXtkI) 518 # 519 # beta_hat = (v - np.dot(self._XtinvXXtkI, np.dot(self.X, v))) \ 520 # / self.rr.k 521 522 beta_hat = betak 523 524 from parsimony.functions import CombinedFunction 525 import parsimony.algorithms.proximal as proximal 526 import parsimony.functions.losses as losses 527 import parsimony.functions.penalties as penalties 528 # import parsimony.functions.nesterov as nesterov 529 530 function = CombinedFunction() 531 function.add_function(losses.RidgeRegression(self.X, self.y, 532 self.rr.k, 533 penalty_start=self.penalty_start, 534 mean=self.mean)) 535 function.add_function(losses.LinearFunction(Ata_tv)) 536 # function.add_function(losses.LinearFunction(Ata_l1)) 537 # A = nesterov.l1.linear_operator_from_variables(self.X.shape[1], 538 # penalty_start=self.penalty_start) 539 # function.add_penalty(nesterov.l1.L1(self.l1.l, A=A, mu=mu_min, 540 # penalty_start=self.penalty_start)) 541 function.add_prox(penalties.L1(self.l1.l, 542 penalty_start=self.penalty_start)) 543 544 fista = proximal.FISTA(eps=eps, max_iter=max_iter) 545 beta_hat_ = fista.run(function, beta_hat) 546 547 # print np.linalg.norm(beta_hat - beta_hat_) 548 549 # print "f:", function.f(beta_hat) 550 # print "f:", function.f(beta_hat_) 551 552 beta_hat = beta_hat_ 553 554 return beta_hat
555
556 - def gap(self, beta, beta_hat=None, 557 eps=consts.TOLERANCE, max_iter=consts.MAX_ITER):
558 """Compute the duality gap. 559 560 From the interface "DualFunction". 561 """ 562 if self.penalty_start > 0: 563 beta_ = beta[self.penalty_start:, :] 564 else: 565 beta_ = beta 566 567 # A = self.A() 568 # alpha = [0] * len(A) 569 # anorm = 0.0 570 # for j in xrange(len(alpha)): 571 # alpha[j] = A[j].dot(beta_) 572 # anorm += alpha[j] ** 2.0 573 # anorm **= 0.5 574 # i = anorm >= consts.TOLERANCE 575 # anorm_i = anorm[i] 576 # for j in xrange(len(alpha)): 577 # alpha[j][i] = np.divide(alpha[j][i], anorm_i) 578 # i = anorm < consts.TOLERANCE 579 # for j in xrange(len(alpha)): 580 # alpha[j][i] = 0.0 581 582 alpha = self.tv.alpha(beta) 583 g = self.fmu(beta) 584 585 n = float(self.X.shape[0]) 586 587 if self.mean: 588 a = (np.dot(self.X, beta) - self.y) * (1.0 / n) 589 f_ = (n / 2.0) * maths.norm(a) ** 2.0 + np.dot(self.y.T, a)[0, 0] 590 else: 591 a = np.dot(self.X, beta) - self.y 592 f_ = (1.0 / 2.0) * maths.norm(a) ** 2.0 + np.dot(self.y.T, a)[0, 0] 593 594 lAta = self.tv.l * self.tv.Aa(alpha) 595 if self.penalty_start > 0: 596 lAta = np.vstack((np.zeros((self.penalty_start, 1)), 597 lAta)) 598 599 alpha_sqsum = 0.0 600 for a_ in alpha: 601 alpha_sqsum += np.sum(a_ ** 2.0) 602 603 z = -np.dot(self.X.T, a) 604 h_ = (1.0 / (2 * self.rr.k)) \ 605 * np.sum(maths.positive(np.abs(z - lAta) - self.l1.l) ** 2.0) \ 606 + (0.5 * self.tv.l * self.tv.get_mu() * alpha_sqsum) 607 608 # print "g :", g 609 # print "f_:", f_ 610 # print "h_:", h_ 611 gap = g + f_ + h_ 612 613 # print "Fenchel duality gap:", gap 614 615 return gap
616
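For reference, the f_ term in gap above is the Fenchel conjugate of the (mean) squared loss, evaluated at a = (X beta - y) / n, the gradient of that loss at X beta (or a = X beta - y when mean=False); the h_ term plays the analogous conjugate role for the ridge, L1 and smoothed-TV block. A short derivation (not from the module source):

    \ell(z) = \frac{1}{2n}\lVert z - y\rVert_2^2
    \quad\Longrightarrow\quad
    \ell^*(a) = \sup_z\,\langle a, z\rangle - \frac{1}{2n}\lVert z - y\rVert_2^2
              = \frac{n}{2}\lVert a\rVert_2^2 + y^\top a,

with the supremum attained at z = y + n a, which is exactly the expression
(n / 2.0) * maths.norm(a) ** 2.0 + np.dot(self.y.T, a)[0, 0] computed in the
mean case; the non-mean case is the same formula with n = 1.
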
617 - def A(self):
618 """Linear operator of the Nesterov function. 619 620 From the interface "NesterovFunction". 621 """ 622 return self.tv.A()
623
624 - def Aa(self, alpha):
625 """Computes A'.alpha. 626 627 From the interface "NesterovFunction". 628 """ 629 return self.tv.Aa(alpha)
630
631 - def project(self, a):
632 """ Projection onto the compact space of the Nesterov function. 633 634 From the interface "NesterovFunction". 635 """ 636 return self.tv.project(a)
637
638 - def parameter(self):
639 """Returns the strongly convex parameter for the function. 640 641 From the interface "StronglyConvex". 642 """ 643 return self.rr.k
644
645 - def step(self, x):
646 """The step size to use in descent methods. 647 648 From the interface "StepSize". 649 650 Parameters 651 ---------- 652 x : Numpy array. The point at which to evaluate the step size. 653 """ 654 return 1.0 / self.L()
655
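A construction sketch for LinearRegressionL1L2TV (not part of the module source). It assumes a helper such as linear_operator_from_shape in parsimony.functions.nesterov.tv to build the sparse TV operator; that helper is not shown in this listing, and the shapes and constants below are placeholders:

    import numpy as np

    import parsimony.functions.nesterov.tv as nesterov_tv
    from parsimony.functions.combinedfunctions import LinearRegressionL1L2TV

    shape = (1, 4, 4)                  # (Z, Y, X) image shape, 16 voxels
    n, p = 30, int(np.prod(shape))

    np.random.seed(42)
    X = np.random.randn(n, p)
    y = np.random.randn(n, 1)

    # Assumed helper that returns the sparse TV linear operator for this shape.
    A = nesterov_tv.linear_operator_from_shape(shape)

    function = LinearRegressionL1L2TV(X, y, l1=0.1, l2=0.5, tv=0.3,
                                      A=A, mu=5e-4, mean=True)

    beta = np.random.randn(p, 1)
    val = function.fmu(beta)           # smoothed objective value
    step = function.step(beta)         # 1 / (rr.L() + tv.L())
    beta = function.prox(beta - step * function.grad(beta), factor=step)
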
656 657 -class LinearRegressionL1L2GL(LinearRegressionL1L2TV):
658 """Combination (sum) of RidgeRegression, L1 and Overlapping Group Lasso. 659 """
660 - def __init__(self, X, y, l1, l2, gl, A=None, mu=0.0, penalty_start=0, 661 mean=True):
662 """ 663 Parameters: 664 ---------- 665 X : Numpy array (n-by-p). The X matrix for the linear regression. 666 667 y : Numpy array (n-by-1). The y vector for the linear regression. 668 669 l1 : Non-negative float. The Lagrange multiplier, or regularisation 670 constant, for the L1 penalty. 671 672 l2 : Non-negative float. The Lagrange multiplier, or regularisation 673 constant, for the ridge penalty. 674 675 gl : Non-negative float. The Lagrange multiplier, or regularisation 676 constant, of the overlapping group L1-L2 function. 677 678 A : Numpy array (usually sparse). The linear operator for the Nesterov 679 formulation for group L1-L2. May not be None! 680 681 mu : Non-negative float. The regularisation constant for the smoothing 682 of the overlapping group L1-L2 function. 683 684 penalty_start : Non-negative integer. The number of columns, variables 685 etc., to except from penalisation. Equivalently, the first 686 index to be penalised. Default is 0, all columns are included. 687 688 mean : Boolean. Whether to compute the squared loss or the mean 689 squared loss. Default is True, the mean squared loss. 690 """ 691 self.X = X 692 self.y = y 693 694 self.rr = RidgeRegression(X, y, l2, penalty_start=penalty_start, 695 mean=mean) 696 self.l1 = L1(l1, penalty_start=penalty_start) 697 self.gl = GroupLassoOverlap(gl, A=A, mu=mu, 698 penalty_start=penalty_start) 699 700 self.penalty_start = penalty_start 701 self.mean = mean 702 703 self.reset()
704
705 - def reset(self):
706 707 self.rr.reset() 708 self.l1.reset() 709 self.gl.reset() 710 711 self._Xty = None 712 self._invXXkI = None 713 self._XtinvXXtkI = None
714
715 - def set_params(self, **kwargs):
716 717 # TODO: This is not good. Solve this better! 718 mu = kwargs.pop("mu", self.get_mu()) 719 self.set_mu(mu) 720 721 super(LinearRegressionL1L2GL, self).set_params(**kwargs)
722
723 - def get_mu(self):
724 """Returns the regularisation constant for the smoothing. 725 726 From the interface "NesterovFunction". 727 """ 728 return self.gl.get_mu()
729
730 - def set_mu(self, mu):
731 """Sets the regularisation constant for the smoothing. 732 733 From the interface "NesterovFunction". 734 735 Parameters: 736 ---------- 737 mu : Non-negative float. The regularisation constant for the smoothing 738 to use from now on. 739 740 Returns: 741 ------- 742 old_mu : Non-negative float. The old regularisation constant for the 743 smoothing that was overwritten and is no longer used. 744 """ 745 return self.gl.set_mu(mu)
746
747 - def f(self, beta):
748 """Function value. 749 """ 750 return self.rr.f(beta) \ 751 + self.l1.f(beta) \ 752 + self.gl.f(beta)
753
754 - def fmu(self, beta, mu=None):
755 """Function value. 756 """ 757 return self.rr.f(beta) \ 758 + self.l1.f(beta) \ 759 + self.gl.fmu(beta, mu)
760
761 - def phi(self, alpha, beta):
762 """ Function value with known alpha. 763 """ 764 return self.rr.f(beta) \ 765 + self.l1.f(beta) \ 766 + self.gl.phi(alpha, beta)
767
768 - def grad(self, beta):
769 """Gradient of the differentiable part of the function. 770 771 From the interface "Gradient". 772 """ 773 return self.rr.grad(beta) \ 774 + self.gl.grad(beta)
775
776 - def L(self):
777 """Lipschitz constant of the gradient. 778 779 From the interface "LipschitzContinuousGradient". 780 """ 781 return self.rr.L() \ 782 + self.gl.L()
783
784 - def prox(self, beta, factor=1.0, **kwargs):
785 """The proximal operator of the non-differentiable part of the 786 function. 787 788 From the interface "ProximalOperator". 789 """ 790 return self.l1.prox(beta, factor, **kwargs)
791
792 - def estimate_mu(self, beta):
793 """Computes a "good" value of mu with respect to the given beta. 794 795 From the interface "NesterovFunction". 796 """ 797 return self.gl.estimate_mu(beta)
798
799 - def M(self):
800 """The maximum value of the regularisation of the dual variable. We 801 have 802 803 M = max_{alpha in K} 0.5*|alpha|²_2. 804 805 From the interface "NesterovFunction". 806 """ 807 return self.gl.M()
808
809 - def mu_opt(self, eps):
810 """The optimal value of mu given epsilon. 811 812 From the interface "Continuation". 813 """ 814 gM = self.gl.l * self.gl.M() 815 816 # Mu is set to 1.0, because it is in fact not here "anymore". It is 817 # factored out in this solution. 818 old_mu = self.gl.set_mu(1.0) 819 gA2 = self.gl.L() # Gamma is in here! 820 self.gl.set_mu(old_mu) 821 822 Lg = self.rr.L() 823 824 return (-gM * gA2 + np.sqrt((gM * gA2) ** 2.0 825 + gM * Lg * gA2 * eps)) \ 826 / (gM * Lg)
827
828 - def eps_opt(self, mu):
829 """The optimal value of epsilon given mu. 830 831 From the interface "Continuation". 832 """ 833 gM = self.gl.l * self.gl.M() 834 835 # Mu is set to 1.0, because it is in fact not here "anymore". It is 836 # factored out in this solution. 837 old_mu = self.gl.set_mu(1.0) 838 gA2 = self.gl.L() # Gamma is in here! 839 self.gl.set_mu(old_mu) 840 841 Lg = self.rr.L() 842 843 return (2.0 * gM * gA2 * mu 844 + gM * Lg * mu ** 2.0) \ 845 / gA2
846
847 - def eps_max(self, mu):
848 """The maximum value of epsilon. 849 850 From the interface "Continuation". 851 852 Parameters 853 ---------- 854 mu : Positive float. The regularisation constant of the smoothing. 855 856 Returns 857 ------- 858 eps : Positive float. The upper limit, the maximum, precision. 859 """ 860 gM = self.gl.l * self.gl.M() 861 862 return float(mu) * gM
863
864 - def mu_max(self, eps):
865 """The maximum value of mu. 866 867 From the interface "Continuation". 868 869 Parameters 870 ---------- 871 eps : Positive float. The maximum precision of the smoothing. 872 873 Returns 874 ------- 875 mu : Positive float. The upper limit, the maximum, of the 876 regularisation constant of the smoothing. 877 """ 878 gM = self.gl.l * self.gl.M() 879 880 return float(eps) / gM
881
882 - def betahat(self, alphak, betak, # mu_min=consts.TOLERANCE, 883 eps=consts.TOLERANCE, max_iter=consts.MAX_ITER):
884 """ Returns the beta that minimises the dual function. Used when we 885 compute the gap. 886 887 From the interface "DualFunction". 888 """ 889 # if self.penalty_start > 0: 890 # betak_ = betak[self.penalty_start:, :] 891 # else: 892 # betak_ = betak 893 894 # if self._Xty is None: 895 # self._Xty = np.dot(self.X.T, self.y) 896 897 Ata_gl = self.gl.l * self.gl.Aa(alphak) 898 if self.penalty_start > 0: 899 Ata_gl = np.vstack((np.zeros((self.penalty_start, 1)), 900 Ata_gl)) 901 902 ## Al1 = nesterov.l1.linear_operator_from_variables(self.X.shape[1], 903 ## penalty_start=self.penalty_start) 904 ## smoothed_l1 = nesterov.l1.L1(self.l1.l, A=Al1, mu=mu_min, 905 ## penalty_start=self.penalty_start) 906 ## Ata_l1 = self.l1.l * smoothed_l1.Aa(smoothed_l1.alpha(betak_)) 907 # Ata_l1 = self.l1.l * SmoothedL1.project([betak_ / mu_min])[0] 908 # if self.penalty_start > 0: 909 # Ata_l1 = np.vstack((np.zeros((self.penalty_start, 1)), 910 # Ata_l1)) 911 # v = (self._Xty - Ata_gl - Ata_l1) 912 # 913 # shape = self.X.shape 914 # 915 # if shape[0] > shape[1]: # If n > p 916 # 917 # # Ridge solution 918 # if self._invXXkI is None: 919 # XtXkI = np.dot(self.X.T, self.X) 920 # index = np.arange(min(XtXkI.shape)) 921 # XtXkI[index, index] += self.rr.k 922 # self._invXXkI = np.linalg.inv(XtXkI) 923 # 924 # beta_hat = np.dot(self._invXXkI, v) 925 # 926 # else: # If p > n 927 # # Ridge solution using the Woodbury matrix identity: 928 # if self._XtinvXXtkI is None: 929 # XXtkI = np.dot(self.X, self.X.T) 930 # index = np.arange(min(XXtkI.shape)) 931 # XXtkI[index, index] += self.rr.k 932 # invXXtkI = np.linalg.inv(XXtkI) 933 # self._XtinvXXtkI = np.dot(self.X.T, invXXtkI) 934 # 935 # beta_hat = (v - np.dot(self._XtinvXXtkI, np.dot(self.X, v))) \ 936 # / self.rr.k 937 938 beta_hat = betak 939 940 from parsimony.functions import CombinedFunction 941 import parsimony.algorithms.proximal as proximal 942 import parsimony.functions.losses as losses 943 import parsimony.functions.penalties as penalties 944 # import parsimony.functions.nesterov as nesterov 945 946 function = CombinedFunction() 947 function.add_function(losses.RidgeRegression(self.X, self.y, 948 self.rr.k, 949 penalty_start=self.penalty_start, 950 mean=self.mean)) 951 function.add_function(losses.LinearFunction(Ata_gl)) 952 # function.add_function(losses.LinearFunction(Ata_l1)) 953 # A = nesterov.l1.linear_operator_from_variables(self.X.shape[1], 954 # penalty_start=self.penalty_start) 955 # function.add_penalty(nesterov.l1.L1(self.l1.l, A=A, mu=mu_min, 956 # penalty_start=self.penalty_start)) 957 function.add_prox(penalties.L1(self.l1.l, 958 penalty_start=self.penalty_start)) 959 960 fista = proximal.FISTA(eps=eps, max_iter=max_iter) 961 beta_hat_ = fista.run(function, beta_hat) 962 963 # print np.linalg.norm(beta_hat - beta_hat_) 964 965 # print "f:", function.f(beta_hat) 966 # print "f:", function.f(beta_hat_) 967 968 beta_hat = beta_hat_ 969 970 return beta_hat
971
972 - def gap(self, beta, beta_hat=None, 973 eps=consts.TOLERANCE, max_iter=consts.MAX_ITER):
974 """Compute the duality gap. 975 976 From the interface "DualFunction". 977 """ 978 # if self.penalty_start > 0: 979 # beta_ = beta[self.penalty_start:, :] 980 # else: 981 # beta_ = beta 982 983 # A = self.A() 984 # alpha = [0] * len(A) 985 # for j in xrange(len(alpha)): 986 # astar = A[j].dot(beta_) 987 # 988 # normas = np.sqrt(np.sum(astar ** 2.0)) 989 # if normas > consts.TOLERANCE: 990 # astar /= normas 991 # else: 992 # astar *= 0.0 993 # 994 # alpha[j] = astar 995 # 996 # g = self.f(beta) 997 998 alpha = self.gl.alpha(beta) 999 g = self.fmu(beta) 1000 1001 n = float(self.X.shape[0]) 1002 1003 if self.mean: 1004 a = (np.dot(self.X, beta) - self.y) * (1.0 / n) 1005 f_ = (n / 2.0) * maths.norm(a) ** 2.0 + np.dot(self.y.T, a)[0, 0] 1006 else: 1007 a = np.dot(self.X, beta) - self.y 1008 f_ = (1.0 / 2.0) * maths.norm(a) ** 2.0 + np.dot(self.y.T, a)[0, 0] 1009 1010 lAta = self.gl.l * self.gl.Aa(alpha) 1011 if self.penalty_start > 0: 1012 lAta = np.vstack((np.zeros((self.penalty_start, 1)), 1013 lAta)) 1014 1015 alpha_sqsum = 0.0 1016 for a_ in alpha: 1017 alpha_sqsum += np.sum(a_ ** 2.0) 1018 1019 z = -np.dot(self.X.T, a) 1020 h_ = (1.0 / (2 * self.rr.k)) \ 1021 * np.sum(maths.positive(np.abs(z - lAta) - self.l1.l) ** 2.0) \ 1022 + (0.5 * self.gl.l * self.gl.get_mu() * alpha_sqsum) 1023 1024 # print "g :", g 1025 # print "f_:", f_ 1026 # print "h_:", h_ 1027 gap = g + f_ + h_ 1028 1029 # print "Fenchel duality gap:", gap 1030 1031 return gap
1032 1033 ## alpha_ = self.gl.alpha(beta) 1034 ## 1035 ## P_ = self.rr.f(beta) \ 1036 ## + self.l1.f(beta) \ 1037 ## + self.gl.phi(alpha_, beta) 1038 ## 1039 ## beta_hat_ = self.betahat(alpha_, beta) 1040 ## 1041 ## D_ = self.rr.f(beta_hat_) \ 1042 ## + self.l1.f(beta_hat_) \ 1043 ## + self.gl.phi(alpha_, beta_hat_) 1044 # 1045 # mu = consts.TOLERANCE 1046 # old_mu = self.gl.set_mu(mu) 1047 # 1048 # alpha = self.gl.alpha(beta) 1049 # 1050 # P = self.rr.f(beta) \ 1051 # + self.l1.f(beta) \ 1052 # + self.gl.phi(alpha, beta) 1053 # 1054 # beta_hat = self.betahat(alpha, beta, eps=eps, max_iter=max_iter) 1055 # 1056 # D = self.rr.f(beta_hat) \ 1057 # + self.l1.f(beta_hat) \ 1058 # + self.gl.phi(alpha, beta_hat) 1059 # 1060 ## print "rr.f :", self.rr.f(beta) - self.rr.f(beta_hat) 1061 ## print "l1.f :", self.l1.f(beta) - self.l1.f(beta_hat) 1062 ## print "gl.phi:", self.gl.phi(alpha, beta) - self.gl.phi(alpha, beta_hat) 1063 # 1064 # self.gl.set_mu(old_mu) 1065 # 1066 ## print "old gap:", (P_ - D_), ", new gap:", (P - D) 1067 ## print "new gap:", (P - D) 1068 # 1069 # return P - D 1070
1071 - def A(self):
1072 """Linear operator of the Nesterov function. 1073 1074 From the interface "NesterovFunction". 1075 """ 1076 return self.gl.A()
1077
1078 - def Aa(self, alpha):
1079 """Computes A^T.alpha. 1080 1081 From the interface "NesterovFunction". 1082 """ 1083 return self.gl.Aa(alpha)
1084
1085 - def project(self, a):
1086 """ Projection onto the compact space of the Nesterov function. 1087 1088 From the interface "NesterovFunction". 1089 """ 1090 return self.gl.project(a)
1091
1092 - def step(self, x):
1093 """The step size to use in descent methods. 1094 1095 From the interface "StepSize". 1096 1097 Parameters 1098 ---------- 1099 x : Numpy array. The point at which to evaluate the step size. 1100 """ 1101 return 1.0 / self.L()
1102
1103 1104 -class LogisticRegressionL1L2TV(LinearRegressionL1L2TV):
1105 """Combination (sum) of RidgeLogisticRegression, L1 and TotalVariation. 1106 """
1107 - def __init__(self, X, y, l1, l2, tv, A=None, mu=0.0, weights=None, 1108 penalty_start=0, mean=True):
1109 """ 1110 Parameters 1111 ---------- 1112 X : Numpy array (n-by-p). The X matrix for the logistic regression. 1113 1114 y : Numpy array (n-by-1). The y vector for the logistic regression. 1115 1116 l1 : Non-negative float. The Lagrange multiplier, or regularisation 1117 constant, for the L1 penalty. 1118 1119 l2 : Non-negative float. The Lagrange multiplier, or regularisation 1120 constant, for the ridge (L2) penalty. 1121 1122 tv : Non-negative float. The Lagrange multiplier, or regularisation 1123 constant, of the smoothed TV function. 1124 1125 A : Numpy array (usually sparse). The linear operator for the Nesterov 1126 formulation for TV. May not be None! 1127 1128 mu : Non-negative float. The regularisation constant for the smoothing 1129 of the TV function. 1130 1131 weights: List with n elements. The sample's weights. 1132 1133 penalty_start : Non-negative integer. The number of columns, variables 1134 etc., to except from penalisation. Equivalently, the first 1135 index to be penalised. Default is 0, all columns are included. 1136 1137 mean : Boolean. Whether to compute the squared loss or the mean 1138 squared loss. Default is True, the mean squared loss. 1139 """ 1140 self.X = X 1141 self.y = y 1142 1143 self.rr = RidgeLogisticRegression(X, y, l2, 1144 weights=weights, 1145 penalty_start=penalty_start, 1146 mean=mean) 1147 self.l1 = L1(l1, penalty_start=penalty_start) 1148 self.tv = TotalVariation(tv, A=A, mu=mu, penalty_start=penalty_start) 1149 if weights is None: 1150 weights = np.ones(y.shape) # .reshape(y.shape) 1151 self.weights = weights 1152 self.penalty_start = penalty_start 1153 self.mean = mean 1154 1155 self.reset()
1156
1157 - def gap(self, beta, beta_hat=None, 1158 eps=consts.TOLERANCE, max_iter=consts.MAX_ITER):
1159 """Compute the duality gap for the logistic function. 1160 1161 From the interface "DualFunction". 1162 """ 1163 if self.penalty_start > 0: 1164 beta_ = beta[self.penalty_start:, :] 1165 else: 1166 beta_ = beta 1167 1168 n = float(self.X.shape[0]) 1169 alpha = self.tv.alpha(beta_) 1170 g = self.fmu(beta_) 1171 Xbeta = np.dot(self.X, beta_) 1172 pi = np.reciprocal(1.0 + np.exp(-Xbeta)) 1173 #if weights is None: 1174 # weights = np.ones(self.y.shape) 1175 scale = 1.0 / n if self.mean else 1. 1176 1177 # a in the next line is the gradient of l at xbeta following the ols 1178 # paper notations 1179 a = (pi - self.y) * (self.weights * scale) 1180 b = ((1. / (self.weights * scale)) * a) + self.y 1181 f_ = np.sum((b * np.log(b) + (1 - b) 1182 * np.log(1 - b)) * (self.weights * scale)) 1183 1184 lAta = self.tv.l * self.tv.Aa(alpha) 1185 if self.penalty_start > 0: 1186 lAta = np.vstack((np.zeros((self.penalty_start, 1)), 1187 lAta)) 1188 1189 alpha_sqsum = 0.0 1190 for a_ in alpha: 1191 alpha_sqsum += np.sum(a_ ** 2.0) 1192 1193 z = -np.dot(self.X.T, a) 1194 h_ = (1.0 / (2 * self.rr.k)) \ 1195 * np.sum(maths.positive(np.abs(z - lAta) - self.l1.l) ** 2.0) \ 1196 + (0.5 * self.tv.l * self.tv.get_mu() * alpha_sqsum) 1197 1198 gap = g + f_ + h_ 1199 1200 return gap
1201
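The f_ term in the logistic gap above comes from the Fenchel conjugate of the log-partition function: since b = a / (weights * scale) + y reduces to pi = sigma(X beta), each summand is that conjugate evaluated at pi_i, weighted by weights * scale. A short derivation (not from the module source):

    \sup_{z\in\mathbb{R}}\;\bigl(b z - \log(1 + e^{z})\bigr)
        = b\log b + (1 - b)\log(1 - b),
    \qquad b\in(0, 1),

with the supremum attained at z = \log\bigl(b / (1 - b)\bigr).
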
1202 -class LogisticRegressionL1L2GL(LinearRegressionL1L2GL):
1203 """Combination (sum) of RidgeLogisticRegression, L1 and TotalVariation. 1204 """
1205 - def __init__(self, X, y, l1, l2, gl, A=None, mu=0.0, weights=None, 1206 penalty_start=0, mean=True):
1207 """ 1208 Parameters 1209 ---------- 1210 X : Numpy array. The X matrix (n-by-p) for the logistic regression. 1211 1212 y : Numpy array. The y vector for the logistic regression. 1213 1214 l1 : Non-negative float. The Lagrange multiplier, or regularisation 1215 constant, for the L1 penalty. 1216 1217 l2 : Non-negative float. The Lagrange multiplier, or regularisation 1218 constant, for the ridge (L2) penalty. 1219 1220 gl : Non-negative float. The Lagrange multiplier, or regularisation 1221 constant, of the smoothed function. 1222 1223 A : Numpy array (usually sparse). The linear operator for the Nesterov 1224 formulation for GL. May not be None! 1225 1226 mu : Non-negative float. The regularisation constant for the smoothing 1227 of the GL function. 1228 1229 weights: List with n elements. The sample's weights. 1230 1231 penalty_start : Non-negative integer. The number of columns, variables 1232 etc., to except from penalisation. Equivalently, the first 1233 index to be penalised. Default is 0, all columns are included. 1234 """ 1235 self.X = X 1236 self.y = y 1237 1238 self.rr = RidgeLogisticRegression(X, y, l2, weights=weights, mean=mean) 1239 self.l1 = L1(l1, penalty_start=penalty_start) 1240 self.gl = GroupLassoOverlap(gl, A=A, mu=mu, 1241 penalty_start=penalty_start) 1242 1243 self.penalty_start = penalty_start 1244 self.mean = mean 1245 1246 self.reset()
1247
 1248 
 1249  class LinearRegressionL2SmoothedL1TV(properties.CompositeFunction, 
 1250                                       properties.NesterovFunction, 
 1251                                       properties.GradientMap, 
 1252                                       properties.DualFunction, 
 1253                                       properties.StronglyConvex): 
1254 """Combination (sum) of Linear Regression, L2 and simultaneously smoothed 1255 L1 and TotalVariation. 1256 1257 Parameters 1258 ---------- 1259 X : Numpy array. The X matrix for the ridge regression. 1260 1261 y : Numpy array. The y vector for the ridge regression. 1262 1263 l2 : Non-negative float. The Lagrange multiplier, or regularisation 1264 constant, for the ridge (L2) penalty. 1265 1266 l1 : Non-negative float. The Lagrange multiplier, or regularisation 1267 constant, for the L1 penalty. 1268 1269 tv : Non-negative float. The Lagrange multiplier, or regularisation 1270 constant, of the TV function. 1271 1272 A : A list or tuple with 4 elements of (usually sparse) arrays. The linear 1273 operator for the smoothed L1+TV. The first element must be the 1274 linear operator for L1 and the following three for TV. May not be 1275 None. 1276 1277 mu : Non-negative float. The regularisation constant for the smoothing. 1278 1279 penalty_start : Non-negative integer. The number of columns, variables 1280 etc., to except from penalisation. Equivalently, the first index 1281 to be penalised. Default is 0, all columns are included. 1282 1283 mean : Boolean. Whether to compute the squared loss or the mean squared 1284 loss. Default is True, the mean squared loss. 1285 """
1286 - def __init__(self, X, y, l2, l1, tv, A=None, mu=consts.TOLERANCE, 1287 penalty_start=0, mean=True):
1288 1289 if l2 < consts.TOLERANCE: 1290 raise ValueError("The L2 regularisation constant must be " + \ 1291 "non-zero.") 1292 1293 self.X = X 1294 self.y = y 1295 1296 self.g = RidgeRegression(X, y, l2, 1297 penalty_start=penalty_start, 1298 mean=mean) 1299 self.h = L1TV(l1=l1, tv=tv, A=A, mu=mu, 1300 penalty_start=penalty_start) 1301 1302 self.mu = float(mu) 1303 1304 self.penalty_start = max(0, int(penalty_start)) 1305 self.mean = bool(mean) 1306 1307 self.reset()
1308
1309 - def reset(self):
1310 1311 self.g.reset() 1312 self.h.reset() 1313 1314 self._Xy = None 1315 self._XtinvXXtkI = None
1316
1317 - def set_params(self, **kwargs):
1318 1319 # TODO: This is not a good solution. Can we solve this in a better way? 1320 mu = kwargs.pop("mu", self.get_mu()) 1321 self.set_mu(mu) 1322 1323 super(LinearRegressionL2SmoothedL1TV, self).set_params(**kwargs)
1324
1325 - def get_mu(self):
1326 """Returns the regularisation constant for the smoothing. 1327 1328 From the interface "NesterovFunction". 1329 """ 1330 return self.h.get_mu()
1331
1332 - def set_mu(self, mu):
1333 """Sets the regularisation constant for the smoothing. 1334 1335 From the interface "NesterovFunction". 1336 1337 Parameters: 1338 ---------- 1339 mu : Non-negative float. The regularisation constant for the smoothing 1340 to use from now on. 1341 1342 Returns: 1343 ------- 1344 old_mu : Non-negative float. The old regularisation constant for the 1345 smoothing that was overwritten and is no longer used. 1346 """ 1347 return self.h.set_mu(mu)
1348
1349 - def f(self, beta):
1350 """ Function value. 1351 """ 1352 return self.g.f(beta) \ 1353 + self.h.f(beta)
1354
1355 - def phi(self, alpha, beta):
1356 """ Function value. 1357 """ 1358 return self.g.f(beta) \ 1359 + self.h.phi(alpha, beta)
1360
1361 - def A(self):
1362 return self.h.A()
1363
1364 - def L(self):
1365 """Lipschitz constant of the gradient. 1366 1367 From the interface "LipschitzContinuousGradient". 1368 """ 1369 # b = self.g.lambda_min() 1370 b = self.parameter() 1371 # TODO: Use max_iter here!! 1372 a = self.h.lambda_max() # max_iter=max_iter) 1373 1374 return a / b
1375
1376 - def parameter(self):
1377 """Returns the strongly convex parameter for the function. 1378 1379 From the interface "StronglyConvex". 1380 """ 1381 return self.g.parameter()
1382
1383 - def V(self, alpha, beta, L):
1384 """The gradient map associated to the function. 1385 1386 From the interface "GradientMap". 1387 """ 1388 if self.penalty_start > 0: 1389 beta_ = beta[self.penalty_start:, :] 1390 else: 1391 beta_ = beta 1392 1393 if L < consts.TOLERANCE: 1394 L = consts.TOLERANCE 1395 1396 A = self.h.A() 1397 a = [0] * len(A) 1398 a[0] = (1.0 / L) * A[0].dot(beta_) 1399 a[1] = (1.0 / L) * A[1].dot(beta_) 1400 a[2] = (1.0 / L) * A[2].dot(beta_) 1401 a[3] = (1.0 / L) * A[3].dot(beta_) 1402 1403 u_new = [0] * len(alpha) 1404 for i in xrange(len(alpha)): 1405 u_new[i] = alpha[i] + a[i] 1406 1407 return self.h.project(u_new)
1408
1409 - def alpha(self, beta):
1410 """ Dual variable of the Nesterov function. 1411 1412 From the interface "NesterovFunction". 1413 """ 1414 return self.h.alpha(beta)
1415
1416 - def betahat(self, alpha, beta=None):
1417 """ Returns the beta that minimises the dual function. 1418 1419 From the interface "DualFunction". 1420 """ 1421 # TODO: Kernelise this function! See how I did in 1422 # LinearRegressionL1L2TV._beta_hat. 1423 1424 A = self.h.A() 1425 lAta = A[0].T.dot(alpha[0]) # L1 1426 lAta += A[1].T.dot(alpha[1]) # TV X 1427 lAta += A[2].T.dot(alpha[2]) # TV Y 1428 lAta += A[3].T.dot(alpha[3]) # TV Z 1429 1430 if self.penalty_start > 0: 1431 lAta = np.vstack((np.zeros((self.penalty_start, 1)), 1432 lAta)) 1433 1434 # XXkI = np.dot(X.T, X) + self.g.k * np.eye(X.shape[1]) 1435 1436 n = float(self.X.shape[0]) 1437 1438 if self._Xy is None: 1439 if self.mean: 1440 self._Xy = np.dot(self.X.T, self.y) * (1.0 / n) 1441 else: 1442 self._Xy = np.dot(self.X.T, self.y) 1443 1444 Xty_lAta = (self._Xy - lAta) * (1.0 / self.g.k) 1445 1446 # t = time() 1447 # XXkI = np.dot(X.T, X) 1448 # index = np.arange(min(XXkI.shape)) 1449 # XXkI[index, index] += self.g.k 1450 # invXXkI = np.linalg.inv(XXkI) 1451 # print "t:", time() - t 1452 # beta = np.dot(invXXkI, Xty_lAta) 1453 1454 if self._XtinvXXtkI is None: 1455 XXtkI = np.dot(self.X, self.X.T) 1456 index = np.arange(min(XXtkI.shape)) 1457 if self.mean: 1458 XXtkI[index, index] += self.g.k * n 1459 else: 1460 XXtkI[index, index] += self.g.k 1461 invXXtkI = np.linalg.inv(XXtkI) 1462 self._XtinvXXtkI = np.dot(self.X.T, invXXtkI) 1463 1464 beta = (Xty_lAta - np.dot(self._XtinvXXtkI, np.dot(self.X, Xty_lAta))) 1465 1466 return beta
1467
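betahat above only ever inverts the n-by-n matrix X X^T + k I, which is the Woodbury matrix identity route mentioned in the commented-out code of the other betahat methods. In the non-mean case the identity reads as follows (a standard reference identity, not taken from this module); the mean case adds k * n instead of k on the diagonal, matching the 1/n scaling of the loss:

    (X^\top X + k I)^{-1} v
        = \frac{1}{k}\Bigl(v - X^\top\bigl(X X^\top + k I\bigr)^{-1} X v\Bigr),

so for p > n the p-by-p inverse is never formed; the code caches
X^\top (X X^\top + k I)^{-1} in self._XtinvXXtkI and applies it to
v / k = (X^\top y - \lambda A^\top\alpha) / k.
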
1468 - def gap(self, beta, beta_hat):
1469 """Compute the duality gap. 1470 1471 From the interface "DualFunction". 1472 """ 1473 # TODO: Add this function or refactor API! 1474 raise NotImplementedError("We cannot currently do this!")
1475
1476 - def estimate_mu(self, beta):
1477 """Computes a "good" value of mu with respect to the given beta. 1478 1479 From the interface "NesterovFunction". 1480 """ 1481 raise NotImplementedError("We do not use this here!")
1482
1483 - def M(self):
1484 """ The maximum value of the regularisation of the dual variable. We 1485 have 1486 1487 M = max_{alpha in K} 0.5*|alpha|²_2. 1488 1489 From the interface "NesterovFunction". 1490 """ 1491 return self.h.M()
1492
1493 - def project(self, a):
1494 """ Projection onto the compact space of the Nesterov function. 1495 1496 From the interface "NesterovFunction". 1497 """ 1498 return self.h.project(a)
1499
 1500 
 1501  class AugmentedLinearRegressionL1L2TV(properties.SplittableFunction, 
 1502                                        properties.AugmentedProximalOperator): 
1503 """Combination (sum) of LinearRegression, L1, L2 and 1D TotalVariation 1504 with a linear constraint. Represents the problem 1505 1506 min. f(b) = g(x) 1507 + h(r) 1508 = (1 / 2) ||Xb - y||² + (k / 2) ||b||² 1509 + l ||r_1||_1 + g ||r_tv||_1, 1510 s.t. r = Db. 1511 1512 The proximal operators of the splittable functions are assumed to be from 1513 augmented Lagrangians. 1514 1515 Note: This function only works for 1-dimensional total variation. 1516 1517 Parameters 1518 ---------- 1519 X : Numpy array. The X matrix for the linear regression. 1520 1521 y : Numpy array. The y vector for the linear regression. 1522 1523 l1 : Non-negative float. The Lagrange multiplier, or regularisation 1524 constant, for the L1 penalty. 1525 1526 l2 : Non-negative float. The Lagrange multiplier, or regularisation 1527 constant, for the ridge penalty. 1528 1529 tv : Non-negative float. The Lagrange multiplier, or regularisation 1530 constant, of the total variation function. 1531 1532 A : List or tuple of numpy (or usually sparse scipy) arrays. The linear 1533 operator for the constraints. 1534 1535 rho : Positive float. The penalty parameter for the augmented Lagrangian. 1536 1537 penalty_start : Non-negative integer. The number of columns, variables 1538 etc., to except from penalisation. Equivalently, the first 1539 index to be penalised. Default is 0, all columns are included. 1540 1541 mean : Boolean. Whether to compute the squared loss or the mean 1542 squared loss. Default is True, the mean squared loss. 1543 """
1544 - def __init__(self, X, y, l1, l2, tv, A=None, rho=1.0, penalty_start=0, 1545 mean=True):
1546 1547 super(AugmentedLinearRegressionL1L2TV, self).__init__(rho=rho) 1548 1549 self.X = X 1550 self.y = y 1551 1552 class MultipleFunctions(properties.Function, 1553 properties.ProximalOperator): 1554 1555 def __init__(self, functions, rho): 1556 self.funcs = functions 1557 self.rho = rho
1558 1559 def f(self, xrr): 1560 if isinstance(xrr, linalgs.MultipartArray): 1561 xrr = xrr.get_parts() 1562 1563 # Rescale the function values by rho. 1564 f = self.funcs[0].f(xrr[0]) * self.rho \ 1565 + self.funcs[1].f(xrr[1]) * self.rho 1566 1567 return f
1568 1569 def reset(self): 1570 self.funcs[0].reset() 1571 self.funcs[1].reset() 1572 1573 def prox(self, xrr, factor=1.0, eps=consts.TOLERANCE, 1574 max_iter=100): 1575 1576 if isinstance(xrr, linalgs.MultipartArray): 1577 xrr = xrr.get_parts() 1578 1579 parts = [0, 0] 1580 1581 parts[0] = self.funcs[0].prox(xrr[0], factor=factor, 1582 eps=eps, max_iter=max_iter) 1583 parts[1] = self.funcs[1].prox(xrr[1], factor=factor, 1584 eps=eps, max_iter=max_iter) 1585 1586 return linalgs.MultipartArray(parts, vertical=True) 1587 1588 class L1TV(properties.SplittableFunction, 1589 properties.ProximalOperator): 1590 1591 def __init__(self, l1, tv, p): 1592 self.l1 = float(l1) 1593 self.tv = float(tv) 1594 self.p = max(1, int(p)) 1595 1596 self.g = L1(l1) 1597 self.h = L1(tv) 1598 1599 def reset(self): 1600 self.g.reset() 1601 self.h.reset() 1602 1603 def f(self, x): 1604 """Function value. 1605 """ 1606 x1 = x[:self.p, :] 1607 x2 = x[self.p:, :] 1608 1609 return self.g.f(x1) \ 1610 + self.h.f(x2) 1611 1612 def prox(self, x, factor=1.0, eps=consts.TOLERANCE, max_iter=100): 1613 1614 x1 = x[:self.p, :] 1615 x2 = x[self.p:, :] 1616 1617 y = np.vstack((self.g.prox(x1, factor=factor, 1618 eps=eps, max_iter=max_iter), 1619 self.h.prox(x2, factor=factor, 1620 eps=eps, max_iter=max_iter))) 1621 1622 return y 1623 1624 self.g = MultipleFunctions([RidgeSquaredError(X, y, l2, l=1.0 / rho, 1625 penalty_start=penalty_start, 1626 mean=mean), 1627 L1TV(l1 / rho, tv / rho, A[0].shape[1])], 1628 rho) 1629 1630 if len(A) == 4: 1631 A = A[:2] # Skip 2nd and 3rd dimension of the image (they are 1) 1632 1633 self.h = LinearVariableConstraint(A, penalty_start=penalty_start, 1634 solver=linalgs.TridiagonalSolver()) 1635 1636 self.penalty_start = max(0, int(penalty_start)) 1637 self.mean = bool(mean) 1638 1639 self.reset() 1640
1641 - def reset(self):
1642 1643 self.g.reset() 1644 self.h.reset()
1645
1646 - def f(self, xy):
1647 1648 if isinstance(xy, linalgs.MultipartArray): 1649 xy = xy.get_parts() 1650 1651 return self.g.f(xy[0]) \ 1652 + self.h.f(xy[1])
1653
1654 - def prox(self, x, **kwargs):
1655 1656 raise NotImplementedError("Use the prox of the parts of the " \ 1657 "splitted function, g.prox() and h.prox().")
1658
1659 - def set_rho(self, rho):
1660 """Update the penalty parameter. 1661 1662 From the interface "AugmentedProximalOperator". 1663 """ 1664 rho = max(0.0, float(rho)) 1665 1666 # Ridge regression 1667 self.g.funcs[0].l = 1.0 / rho 1668 # L1 1669 self.g.funcs[1].g.l = self.g.funcs[1].l1 / rho 1670 # TV 1671 self.g.funcs[1].h.l = self.g.funcs[1].tv / rho 1672 1673 # MultipleFunctions 1674 self.g.rho = rho 1675 1676 # self 1677 self.rho = rho
1678
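The rescaling by rho in set_rho above matches the usual scaled-dual augmented Lagrangian treatment of the constraint r = Db. One standard form of that Lagrangian, with u the scaled dual variable (a reference formula, not taken from this module or its solver):

    L_\rho(b, r, u)
        = g(b) + h(r)
        + \frac{\rho}{2}\,\lVert D b - r + u\rVert_2^2
        - \frac{\rho}{2}\,\lVert u\rVert_2^2.

Minimising over r with b and u fixed is the proximal operator of h / \rho
evaluated at D b + u, which is why set_rho divides the L1 and TV constants
(and the loss weight) by \rho.
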
 1679 
 1680  class PrincipalComponentAnalysisL1TV(properties.CompositeFunction, 
 1681                                       properties.NesterovFunction, 
 1682                                       properties.Continuation, 
 1683                                       properties.DualFunction, 
 1684                                       properties.StronglyConvex, 
 1685                                       properties.StepSize): 
1686 """Combination (sum) of PCA (Variance), L1 and TotalVariation 1687 """
1688 - def __init__(self, X, l, g, A=None, mu=0.0, penalty_start=0):
1689 """ 1690 Parameters: 1691 ---------- 1692 X : Numpy array. The X matrix for the ridge regression. 1693 1694 l : Non-negative float. The Lagrange multiplier, or regularisation 1695 constant, for the L1 penalty. 1696 1697 g : Non-negative float. The Lagrange multiplier, or regularisation 1698 constant, of the smoothed TV function. 1699 1700 A : Numpy array (usually sparse). The linear operator for the Nesterov 1701 formulation of TV. May not be None! 1702 1703 mu : Non-negative float. The regularisation constant for the smoothing 1704 of the TV function. 1705 1706 penalty_start : Non-negative integer. The number of columns, variables 1707 etc., to except from penalisation. Equivalently, the first 1708 index to be penalised. Default is 0, all columns are included. 1709 """ 1710 self.X = X 1711 self.pca = LatentVariableVariance(X) 1712 self.l1 = L1(l, penalty_start=penalty_start) 1713 self.tv = TotalVariation(g, A=A, mu=mu, penalty_start=penalty_start) 1714 1715 self.reset()
1716
1717 - def reset(self):
1718 1719 self.pca.reset() 1720 self.l1.reset() 1721 self.tv.reset() 1722 1723 self._Xty = None 1724 self._invXXkI = None 1725 self._XtinvXXtkI = None
1726
1727 - def set_params(self, **kwargs):
1728 1729 # TODO: This is not a nice solution. Can we solve it better? 1730 mu = kwargs.pop("mu", self.get_mu()) 1731 self.set_mu(mu) 1732 1733 super(PrincipalComponentAnalysisL1TV, self).set_params(**kwargs)
1734
1735 - def get_mu(self):
1736 """Returns the regularisation constant for the smoothing. 1737 1738 From the interface "NesterovFunction". 1739 """ 1740 return self.tv.get_mu()
1741
1742 - def set_mu(self, mu):
1743 """Sets the regularisation constant for the smoothing. 1744 1745 From the interface "NesterovFunction". 1746 1747 Parameters: 1748 ---------- 1749 mu : Non-negative float. The regularisation constant for the smoothing 1750 to use from now on. 1751 1752 Returns: 1753 ------- 1754 old_mu : Non-negative float. The old regularisation constant for the 1755 smoothing that was overwritten and is no longer used. 1756 """ 1757 return self.tv.set_mu(mu)
1758
1759 - def f(self, beta):
1760 """Function value. 1761 """ 1762 return self.pca.f(beta) \ 1763 + self.l1.f(beta) \ 1764 + self.tv.f(beta)
1765
1766 - def fmu(self, beta, mu=None):
1767 """Function value. 1768 """ 1769 return self.pca.f(beta) \ 1770 + self.l1.f(beta) \ 1771 + self.tv.fmu(beta, mu)
1772
1773 - def phi(self, alpha, beta):
1774 """ Function value with known alpha. 1775 """ 1776 return self.pca.f(beta) \ 1777 + self.l1.f(beta) \ 1778 + self.tv.phi(alpha, beta)
1779
1780 - def grad(self, beta):
1781 """Gradient of the differentiable part of the function. 1782 1783 From the interface "Gradient". 1784 """ 1785 return self.pca.grad(beta) \ 1786 + self.tv.grad(beta)
1787
1788 - def L(self):
1789 """Lipschitz constant of the gradient. 1790 1791 From the interface "LipschitzContinuousGradient". 1792 """ 1793 return self.pca.L() \ 1794 + self.tv.L()
1795
1796 - def prox(self, beta, factor=1.0, **kwargs):
1797 """The proximal operator of the non-differentiable part of the 1798 function. 1799 1800 From the interface "ProximalOperator". 1801 """ 1802 return self.l1.prox(beta, factor, **kwargs)
1803
1804 - def estimate_mu(self, beta):
1805 """Computes a "good" value of mu with respect to the given beta. 1806 1807 From the interface "NesterovFunction". 1808 """ 1809 return self.tv.estimate_mu(beta)
1810
1811 - def M(self):
1812 """The maximum value of the regularisation of the dual variable. We 1813 have 1814 1815 M = max_{alpha in K} 0.5*|alpha|²_2. 1816 1817 From the interface "NesterovFunction". 1818 """ 1819 return self.tv.M()
1820
1821 - def mu_opt(self, eps):
1822 """The optimal value of mu given epsilon. 1823 1824 From the interface "Continuation". 1825 """ 1826 gM = self.tv.l * self.tv.M() 1827 1828 # Mu is set to 1.0, because it is in fact not here "anymore". It is 1829 # factored out in this solution. 1830 old_mu = self.tv.set_mu(1.0) 1831 gA2 = self.tv.L() # Gamma is in here! 1832 self.tv.set_mu(old_mu) 1833 1834 Lg = self.rr.L() 1835 1836 return (-gM * gA2 + np.sqrt((gM * gA2) ** 2.0 1837 + gM * Lg * gA2 * eps)) \ 1838 / (gM * Lg)
1839
1840 - def eps_opt(self, mu):
1841 """The optimal value of epsilon given mu. 1842 1843 From the interface "Continuation". 1844 """ 1845 gM = self.tv.l * self.tv.M() 1846 1847 # Mu is set to 1.0, because it is in fact not here "anymore". It is 1848 # factored out in this solution. 1849 old_mu = self.tv.set_mu(1.0) 1850 gA2 = self.tv.L() # Gamma is in here! 1851 self.tv.set_mu(old_mu) 1852 1853 Lg = self.rr.L() 1854 1855 return (2.0 * gM * gA2 * mu 1856 + gM * Lg * mu ** 2.0) \ 1857 / gA2
1858
1859 - def eps_max(self, mu):
1860 """The maximum value of epsilon. 1861 1862 From the interface "Continuation". 1863 1864 Parameters 1865 ---------- 1866 mu : Positive float. The regularisation constant of the smoothing. 1867 1868 Returns 1869 ------- 1870 eps : Positive float. The upper limit, the maximum, precision. 1871 """ 1872 gM = self.tv.l * self.tv.M() 1873 1874 return float(mu) * gM
1875
1876 - def mu_max(self, eps):
1877 """The maximum value of mu. 1878 1879 From the interface "Continuation". 1880 1881 Parameters 1882 ---------- 1883 eps : Positive float. The maximum precision of the smoothing. 1884 1885 Returns 1886 ------- 1887 mu : Positive float. The upper limit, the maximum, of the 1888 regularisation constant of the smoothing. 1889 """ 1890 gM = self.tv.l * self.tv.M() 1891 1892 return float(eps) / gM
1893
1894 - def betahat(self, alphak, betak):
1895 """ Returns the beta that minimises the dual function. Used when we 1896 compute the gap. 1897 1898 From the interface "DualFunction". 1899 """ 1900 raise NotImplementedError('Abstract method "betahat" must be ' 1901 'specialised!')
1902 # if self._Xty is None: 1903 # self._Xty = np.dot(self.X.T, self.y) 1904 # 1905 # Ata_tv = self.tv.l * self.tv.Aa(alphak) 1906 # Ata_l1 = self.l1.l * SmoothedL1.project([betak / consts.TOLERANCE])[0] 1907 # v = (self._Xty - Ata_tv - Ata_l1) 1908 # 1909 # shape = self.X.shape 1910 # 1911 # if shape[0] > shape[1]: # If n > p 1912 # 1913 # # Ridge solution 1914 # if self._invXXkI is None: 1915 # XtXkI = np.dot(self.X.T, self.X) 1916 # index = np.arange(min(XtXkI.shape)) 1917 # XtXkI[index, index] += self.rr.k 1918 # self._invXXkI = np.linalg.inv(XtXkI) 1919 # 1920 # beta_hat = np.dot(self._invXXkI, v) 1921 # 1922 # else: # If p > n 1923 # # Ridge solution using the Woodbury matrix identity: 1924 # if self._XtinvXXtkI is None: 1925 # XXtkI = np.dot(self.X, self.X.T) 1926 # index = np.arange(min(XXtkI.shape)) 1927 # XXtkI[index, index] += self.rr.k 1928 # invXXtkI = np.linalg.inv(XXtkI) 1929 # self._XtinvXXtkI = np.dot(self.X.T, invXXtkI) 1930 # 1931 # beta_hat = (v - np.dot(self._XtinvXXtkI, np.dot(self.X, v))) \ 1932 # / self.rr.k 1933 # 1934 # return beta_hat 1935
1936 - def gap(self, beta, beta_hat=None):
1937 """Compute the duality gap. 1938 1939 From the interface "DualFunction". 1940 """ 1941 raise NotImplementedError('Abstract method "gap" must be ' 1942 'specialised!')
1943 # alpha = self.tv.alpha(beta) 1944 # 1945 # P = self.rr.f(beta) \ 1946 # + self.l1.f(beta) \ 1947 # + self.tv.phi(alpha, beta) 1948 # 1949 # beta_hat = self.betahat(alpha, beta) 1950 # 1951 # D = self.rr.f(beta_hat) \ 1952 # + self.l1.f(beta_hat) \ 1953 # + self.tv.phi(alpha, beta_hat) 1954 # 1955 # return P - D 1956
1957 - def A(self):
1958 """Linear operator of the Nesterov function. 1959 1960 From the interface "NesterovFunction". 1961 """ 1962 return self.tv.A()
1963
1964 - def Aa(self, alpha):
1965 """Computes A^\T\alpha. 1966 1967 From the interface "NesterovFunction". 1968 """ 1969 return self.tv.Aa(alpha)
1970
1971 - def project(self, a):
1972 """ Projection onto the compact space of the Nesterov function. 1973 1974 From the interface "NesterovFunction". 1975 """ 1976 return self.tv.project(a)
1977
1978 - def parameter(self):
1979 """Returns the strongly convex parameter for the function. 1980 1981 From the interface "StronglyConvex". 1982 """ 1983 return self.rr.k
1984
1985 - def step(self, x):
1986 """The step size to use in descent methods. 1987 1988 From the interface "StepSize". 1989 1990 Parameters 1991 ---------- 1992 x : Numpy array. The point at which to evaluate the step size. 1993 """ 1994 return 1.0 / self.L()
1995