"""
==========================================
Primal (Block) Coordinate Descent Solvers
==========================================
This module provides (block) coordinate descent solvers for a variety of loss
functions and penalties.
"""
# Author: Mathieu Blondel
# License: BSD
import numpy as np
from joblib import Parallel, delayed
from .base import BaseClassifier
from .base import BaseRegressor
from .dataset_fast import get_dataset
from .primal_cd_fast import _primal_cd
from .primal_cd_fast import Squared
from .primal_cd_fast import SmoothHinge
from .primal_cd_fast import SquaredHinge
from .primal_cd_fast import ModifiedHuber
from .primal_cd_fast import Log
class _BaseCD(object):
def _get_loss(self):
params = {"max_steps": self._get_max_steps(),
"sigma": self.sigma,
"beta": self.beta,
"verbose": self.verbose}
losses = {
"squared": Squared(verbose=self.verbose),
"smooth_hinge": SmoothHinge(**params),
"squared_hinge": SquaredHinge(**params),
"modified_huber": ModifiedHuber(**params),
"log": Log(**params),
}
return losses[self.loss]
def _get_max_steps(self):
if self.max_steps == "auto":
if self.loss == "log":
max_steps = 0
else:
max_steps = 30
else:
max_steps = self.max_steps
return max_steps
def _get_penalty(self):
penalties = {
"l1": 1,
"l2": 2,
}
return penalties[self.penalty]
def _init_errors(self, Y):
n_samples, n_vectors = Y.shape
if self.loss == "squared":
self.errors_ = -Y.T
else:
self.errors_ = np.ones((n_vectors, n_samples), dtype=np.float64)
[docs]class CDClassifier(_BaseCD, BaseClassifier):
r"""Estimator for learning linear classifiers by (block) coordinate descent.
The objective functions considered take the form
minimize F(W) = C * L(W) + alpha * R(W),
where L(W) is a loss term and R(W) is a penalty term.
Parameters
----------
loss : str, 'squared_hinge', 'log', 'modified_huber', 'squared'
The loss function to be used.
penalty : str, 'l2', 'l1', 'l1/l2'
The penalty to be used.
- l2: ridge
- l1: lasso
- l1/l2: group lasso
multiclass : bool
Whether to use a direct multiclass formulation (True) or one-vs-rest
(False). Direct formulations are only available for loss='squared_hinge'
and loss='log'.
C : float
Weight of the loss term.
alpha : float
Weight of the penalty term.
max_iter : int
Maximum number of iterations to perform.
tol : float
Tolerance of the stopping criterion.
termination : str, 'violation_sum', 'violation_max'
Stopping criterion to use.
shrinking : bool
Whether to activate shrinking or not.
max_steps : int or "auto"
Maximum number of steps to use during the line search. Use max_steps=0
to use a constant step size instead of the line search. Use
max_steps="auto" to let CDClassifier choose the best value.
sigma : float
Constant used in the line search sufficient decrease condition.
beta : float
Multiplicative constant used in the backtracking line search.
warm_start : bool
Whether to activate warm-start or not.
debiasing : bool
Whether to refit the model using l2 penalty (only useful if penalty='l1'
or penalty='l1/l2').
Cd : float
Value of `C` when doing debiasing.
warm_debiasing : bool
Whether to warm-start the model or not when doing debiasing.
selection : str, 'cyclic', 'uniform'
Strategy to use for selecting coordinates.
permute : bool
Whether to permute coordinates or not before cycling (only when
selection='cyclic').
callback : callable
Callback function.
n_calls : int
Frequency with which `callback` must be called.
random_state : RandomState or int
The seed of the pseudo random number generator to use.
verbose : int
Verbosity level.
n_jobs : int
Number of CPU's to be used when `multiclass=False` and when
penalty is a non group-lasso penalty. By default use one CPU.
If set to -1, use all CPU's
Examples
--------
The following example demonstrates how to learn a classification
model with a multiclass squared hinge loss and an l1/l2 penalty.
>>> from sklearn.datasets import fetch_20newsgroups_vectorized
>>> from lightning.classification import CDClassifier
>>> bunch = fetch_20newsgroups_vectorized(subset="all")
>>> X, y = bunch.data, bunch.target
>>> clf = CDClassifier(penalty="l1/l2",
loss="squared_hinge",
multiclass=True,
max_iter=20,
alpha=1e-4,
C=1.0 / X.shape[0],
tol=1e-3,
random_state=0).fit(X, y)
>>> accuracy = clf.score(X, y)
References
----------
Block Coordinate Descent Algorithms for Large-scale Sparse Multiclass
Classification. Mathieu Blondel, Kazuhiro Seki, and Kuniaki Uehara.
Machine Learning, May 2013.
"""
def __init__(self, loss="squared_hinge", penalty="l2", multiclass=False,
C=1.0, alpha=1.0,
max_iter=50, tol=1e-3, termination="violation_sum",
shrinking=True,
max_steps="auto", sigma=0.01, beta=0.5,
warm_start=False, debiasing=False, Cd=1.0,
warm_debiasing=False,
selection="cyclic", permute=True,
callback=None, n_calls=100,
random_state=None, verbose=0, n_jobs=1):
self.C = C
self.alpha = alpha
self.loss = loss
self.penalty = penalty
self.multiclass = multiclass
self.max_iter = max_iter
self.tol = tol
self.termination = termination
self.shrinking = shrinking
self.max_steps = max_steps
self.sigma = sigma
self.beta = beta
self.warm_start = warm_start
self.debiasing = debiasing
self.Cd = Cd
self.warm_debiasing = warm_debiasing
self.selection = selection
self.permute = permute
self.callback = callback
self.n_calls = n_calls
self.random_state = random_state
self.verbose = verbose
self.coef_ = None
self.violation_init_ = {}
self.n_jobs = n_jobs
[docs] def fit(self, X, y):
"""Fit model according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : classifier
Returns self.
"""
rs = self._get_random_state()
# Create dataset
ds = get_dataset(X, order="fortran")
n_samples = ds.get_n_samples()
n_features = ds.get_n_features()
if self.penalty != "l1/l2" and self.multiclass:
raise NotImplementedError("True multiclass options not implemented "
"for non group-lasso(l1/l2) penalties.")
# Create label transformers
#neg_label = 0 if self.penalty == "nn" else -1
reencode = self.penalty == "l1/l2"
y, n_classes, n_vectors = self._set_label_transformers(y,
reencode,
neg_label=-1)
Y = np.asfortranarray(self.label_binarizer_.transform(y),
dtype=np.float64)
# Initialize coefficients
if not self.warm_start or self.coef_ is None:
self.C_init = self.C
self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
self._init_errors(Y)
self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
indices = np.arange(n_features, dtype=np.int32)
max_steps = self._get_max_steps()
# Learning
if self.penalty == "l1/l2":
tol = self.tol
#n_min = np.min(np.sum(Y == 1, axis=0))
#tol *= max(n_min, 1) / n_samples
vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
model = _primal_cd(self, self.coef_, self.errors_,
ds, y, Y, -1, self.multiclass,
indices, 12, self._get_loss(),
self.selection, self.permute, self.termination,
self.C, self.alpha,
self.max_iter, max_steps,
self.shrinking, vinit,
rs, tol, self.callback, self.n_calls,
self.verbose)
viol = model[0]
if self.warm_start and len(self.violation_init_) == 0:
self.violation_init_[0] = viol
elif self.penalty in ("l1", "l2", "nn"):
penalty = self._get_penalty()
n_pos = np.zeros(n_vectors)
vinit = self.C / self.C_init * np.ones_like(n_pos)
for k in range(n_vectors):
n_pos[k] = np.sum(Y[:, k] == 1)
vinit[k] *= self.violation_init_.get(k, 0)
n_neg = n_samples - n_pos
tol = self.tol * np.maximum(np.minimum(n_pos, n_neg), 1) / n_samples
jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
ds, y, Y, k, False,
indices, penalty, self._get_loss(),
self.selection, self.permute,
self.termination,
self.C, self.alpha,
self.max_iter, max_steps,
self.shrinking, vinit[k],
rs, tol[k], self.callback, self.n_calls,
self.verbose)
for k in range(n_vectors))
model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
viol, coefs, errors = zip(*model)
self.coef_ = np.asarray(coefs)
self.errors_ = np.asarray(errors)
for k in range(n_vectors):
if self.warm_start and not k in self.violation_init_:
self.violation_init_[k] = viol[k]
if self.debiasing:
nz = self.coef_ != 0
if not self.warm_debiasing:
self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
self._init_errors(Y)
indices = np.arange(n_features, dtype=np.int32)
jobs = (delayed(_primal_cd)(
self, self.coef_, self.errors_,
ds, y, Y, k, False,
indices[nz[k]], 2, self._get_loss(),
"cyclic", self.permute,
"violation_sum",
self.Cd, 1.0,
self.max_iter, max_steps,
False, 0,
rs, self.tol, self.callback, self.n_calls,
self.verbose
)
for k in range(n_vectors))
model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
viol, coefs, errors = zip(*model)
self.coef_ = np.asarray(coefs)
self.errors_ = np.asarray(errors)
return self
[docs]class CDRegressor(_BaseCD, BaseRegressor):
r"""Estimator for learning linear regressors by (block) coordinate descent.
The objective functions considered take the form
minimize F(W) = C * L(W) + alpha * R(W),
where L(W) is a loss term and R(W) is a penalty term.
Parameters
----------
loss : str, 'squared'
The loss function to be used.
penalty : str, 'l2', 'l1', 'l1/l2'
The penalty to be used.
- l2: ridge
- l1: lasso
- l1/l2: group lasso
For other parameters, see `CDClassifier`.
"""
def __init__(self, C=1.0, alpha=1.0,
loss="squared", penalty="l2",
max_iter=50, tol=1e-3, termination="violation_sum",
shrinking=True,
max_steps=30, sigma=0.01, beta=0.5,
warm_start=False, debiasing=False, Cd=1.0,
warm_debiasing=False,
selection="cyclic", permute=True,
callback=None, n_calls=100,
random_state=None, verbose=0, n_jobs=1):
self.C = C
self.alpha = alpha
self.loss = loss
self.penalty = penalty
self.max_iter = max_iter
self.tol = tol
self.termination = termination
self.shrinking = shrinking
self.max_steps = max_steps
self.sigma = sigma
self.beta = beta
self.warm_start = warm_start
self.debiasing = debiasing
self.Cd = Cd
self.warm_debiasing = warm_debiasing
self.selection = selection
self.permute = permute
self.callback = callback
self.n_calls = n_calls
self.random_state = random_state
self.verbose = verbose
self.coef_ = None
self.violation_init_ = {}
self.n_jobs = n_jobs
[docs] def fit(self, X, y):
"""Fit model according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values.
Returns
-------
self : regressor
Returns self.
"""
rs = self._get_random_state()
# Create dataset
ds = get_dataset(X, order="fortran")
n_features = ds.get_n_features()
self.outputs_2d_ = len(y.shape) == 2
if self.outputs_2d_:
Y = y
else:
Y = y.reshape(-1, 1)
Y = np.asfortranarray(Y, dtype=np.float64)
y = np.empty(0, dtype=np.int32)
n_vectors = Y.shape[1]
# Initialize coefficients
if not self.warm_start or self.coef_ is None:
self.C_init = self.C
self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
self._init_errors(Y)
self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
indices = np.arange(n_features, dtype=np.int32)
if self.penalty == "l1/l2":
vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
model = _primal_cd(self, self.coef_, self.errors_,
ds, y, Y, -1, False,
indices, 12, self._get_loss(),
self.selection, self.permute, self.termination,
self.C, self.alpha,
self.max_iter, self.max_steps,
self.shrinking, vinit,
rs, self.tol, self.callback, self.n_calls,
self.verbose)
viol = model[0]
if self.warm_start and len(self.violation_init_) == 0:
self.violation_init_[0] = viol
else:
penalty = self._get_penalty()
vinit = np.asarray([self.violation_init_.get(k, 0)
for k in range(n_vectors)]) * self.C / self.C_init
jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
ds, y, Y, k, False,
indices, penalty, self._get_loss(),
self.selection, self.permute,
self.termination,
self.C, self.alpha,
self.max_iter, self.max_steps,
self.shrinking, vinit[k],
rs, self.tol, self.callback, self.n_calls,
self.verbose)
for k in range(n_vectors))
model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
viol, self.coef_, self.error_ = zip(*model)
self.coef_ = np.asarray(self.coef_)
self.error_ = np.asarray(self.error_)
if self.warm_start and not n_vectors in self.violation_init_:
self.violation_init_[n_vectors] = viol
return self