"""
==========================================
Primal (Block) Coordinate Descent Solvers
==========================================

This module provides (block) coordinate descent solvers for a variety of loss
functions and penalties.
"""

# Author: Mathieu Blondel
# License: BSD

import numpy as np

from joblib import Parallel, delayed

from .base import BaseClassifier
from .base import BaseRegressor

from .dataset_fast import get_dataset
from .primal_cd_fast import _primal_cd

from .primal_cd_fast import Squared
from .primal_cd_fast import SmoothHinge
from .primal_cd_fast import SquaredHinge
from .primal_cd_fast import ModifiedHuber
from .primal_cd_fast import Log


class _BaseCD(object):

    def _get_loss(self):
        params = {"max_steps": self._get_max_steps(),
                  "sigma": self.sigma,
                  "beta": self.beta,
                  "verbose": self.verbose}

        losses = {
            "squared": Squared(verbose=self.verbose),
            "smooth_hinge": SmoothHinge(**params),
            "squared_hinge": SquaredHinge(**params),
            "modified_huber": ModifiedHuber(**params),
            "log": Log(**params),
        }

        return losses[self.loss]

    def _get_max_steps(self):
        if self.max_steps == "auto":
            if self.loss == "log":
                max_steps = 0
            else:
                max_steps = 30
        else:
            max_steps = self.max_steps
        return max_steps

    def _get_penalty(self):
        # Integer codes understood by the Cython solver (_primal_cd).
        # The l1/l2 group penalty does not go through this mapping: fit
        # passes its code (12) to _primal_cd directly.
        penalties = {
            "l1": 1,
            "l2": 2,
        }
        return penalties[self.penalty]

    def _init_errors(self, Y):
        n_samples, n_vectors = Y.shape
        if self.loss == "squared":
            # Squared loss: errors_ holds residuals (prediction - target);
            # with all-zero coefficients this is simply -Y.
            self.errors_ = -Y.T
        else:
            # Margin-based losses: errors_ holds 1 - y * prediction,
            # which equals 1 everywhere when coefficients are zero.
            self.errors_ = np.ones((n_vectors, n_samples), dtype=np.float64)

class CDClassifier(_BaseCD, BaseClassifier):
    r"""Estimator for learning linear classifiers by (block) coordinate
    descent.

    The objective functions considered take the form

        minimize F(W) = C * L(W) + alpha * R(W),

    where L(W) is a loss term and R(W) is a penalty term.

    Parameters
    ----------
    loss : str, 'squared_hinge', 'log', 'modified_huber', 'squared'
        The loss function to be used.

    penalty : str, 'l2', 'l1', 'l1/l2'
        The penalty to be used.

        - l2: ridge
        - l1: lasso
        - l1/l2: group lasso

    multiclass : bool
        Whether to use a direct multiclass formulation (True) or
        one-vs-rest (False). Direct formulations are only available for
        loss='squared_hinge' and loss='log'.

    C : float
        Weight of the loss term.

    alpha : float
        Weight of the penalty term.

    max_iter : int
        Maximum number of iterations to perform.

    tol : float
        Tolerance of the stopping criterion.

    termination : str, 'violation_sum', 'violation_max'
        Stopping criterion to use.

    shrinking : bool
        Whether to activate shrinking or not.

    max_steps : int or "auto"
        Maximum number of steps to use during the line search. Use
        max_steps=0 to use a constant step size instead of the line search.
        Use max_steps="auto" to let CDClassifier choose the best value.

    sigma : float
        Constant used in the line search sufficient decrease condition.

    beta : float
        Multiplicative constant used in the backtracking line search.

    warm_start : bool
        Whether to activate warm-start or not.

    debiasing : bool
        Whether to refit the model using l2 penalty (only useful if
        penalty='l1' or penalty='l1/l2').

    Cd : float
        Value of `C` when doing debiasing.

    warm_debiasing : bool
        Whether to warm-start the model or not when doing debiasing.

    selection : str, 'cyclic', 'uniform'
        Strategy to use for selecting coordinates.

    permute : bool
        Whether to permute coordinates or not before cycling (only when
        selection='cyclic').

    callback : callable
        Callback function.

    n_calls : int
        Frequency with which `callback` must be called.

    random_state : RandomState or int
        The seed of the pseudo random number generator to use.

    verbose : int
        Verbosity level.

    n_jobs : int
        Number of CPUs to use when `multiclass=False` and the penalty is
        not a group-lasso penalty. One CPU is used by default; set to -1
        to use all CPUs.

    Examples
    --------
    The following example demonstrates how to learn a classification model
    with a multiclass squared hinge loss and an l1/l2 penalty.

    >>> from sklearn.datasets import fetch_20newsgroups_vectorized
    >>> from lightning.classification import CDClassifier
    >>> bunch = fetch_20newsgroups_vectorized(subset="all")
    >>> X, y = bunch.data, bunch.target
    >>> clf = CDClassifier(penalty="l1/l2",
    ...                    loss="squared_hinge",
    ...                    multiclass=True,
    ...                    max_iter=20,
    ...                    alpha=1e-4,
    ...                    C=1.0 / X.shape[0],
    ...                    tol=1e-3,
    ...                    random_state=0).fit(X, y)
    >>> accuracy = clf.score(X, y)

    References
    ----------
    Block Coordinate Descent Algorithms for Large-scale Sparse Multiclass
    Classification. Mathieu Blondel, Kazuhiro Seki, and Kuniaki Uehara.
    Machine Learning, May 2013.
""" def __init__(self, loss="squared_hinge", penalty="l2", multiclass=False, C=1.0, alpha=1.0, max_iter=50, tol=1e-3, termination="violation_sum", shrinking=True, max_steps="auto", sigma=0.01, beta=0.5, warm_start=False, debiasing=False, Cd=1.0, warm_debiasing=False, selection="cyclic", permute=True, callback=None, n_calls=100, random_state=None, verbose=0, n_jobs=1): self.C = C self.alpha = alpha self.loss = loss self.penalty = penalty self.multiclass = multiclass self.max_iter = max_iter self.tol = tol self.termination = termination self.shrinking = shrinking self.max_steps = max_steps self.sigma = sigma self.beta = beta self.warm_start = warm_start self.debiasing = debiasing self.Cd = Cd self.warm_debiasing = warm_debiasing self.selection = selection self.permute = permute self.callback = callback self.n_calls = n_calls self.random_state = random_state self.verbose = verbose self.coef_ = None self.violation_init_ = {} self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : classifier
            Returns self.
        """
        rs = self._get_random_state()

        # Create dataset.
        ds = get_dataset(X, order="fortran")
        n_samples = ds.get_n_samples()
        n_features = ds.get_n_features()

        if self.penalty != "l1/l2" and self.multiclass:
            raise NotImplementedError("True multiclass options not "
                                      "implemented for non group-lasso "
                                      "(l1/l2) penalties.")

        # Create label transformers.
        #neg_label = 0 if self.penalty == "nn" else -1
        reencode = self.penalty == "l1/l2"
        y, n_classes, n_vectors = self._set_label_transformers(y,
                                                               reencode,
                                                               neg_label=-1)
        Y = np.asfortranarray(self.label_binarizer_.transform(y),
                              dtype=np.float64)

        # Initialize coefficients.
        if not self.warm_start or self.coef_ is None:
            self.C_init = self.C
            self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
            self._init_errors(Y)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        indices = np.arange(n_features, dtype=np.int32)

        max_steps = self._get_max_steps()

        # Learning.
        if self.penalty == "l1/l2":
            tol = self.tol
            #n_min = np.min(np.sum(Y == 1, axis=0))
            #tol *= max(n_min, 1) / n_samples

            vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
            model = _primal_cd(self, self.coef_, self.errors_,
                               ds, y, Y, -1, self.multiclass,
                               indices, 12, self._get_loss(),
                               self.selection, self.permute,
                               self.termination, self.C, self.alpha,
                               self.max_iter, max_steps,
                               self.shrinking, vinit,
                               rs, tol, self.callback, self.n_calls,
                               self.verbose)
            viol = model[0]
            if self.warm_start and len(self.violation_init_) == 0:
                self.violation_init_[0] = viol

        elif self.penalty in ("l1", "l2", "nn"):
            penalty = self._get_penalty()

            n_pos = np.zeros(n_vectors)
            vinit = self.C / self.C_init * np.ones_like(n_pos)
            for k in range(n_vectors):
                n_pos[k] = np.sum(Y[:, k] == 1)
                vinit[k] *= self.violation_init_.get(k, 0)

            n_neg = n_samples - n_pos
            tol = self.tol * np.maximum(np.minimum(n_pos, n_neg),
                                        1) / n_samples

            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                        ds, y, Y, k, False,
                                        indices, penalty, self._get_loss(),
                                        self.selection, self.permute,
                                        self.termination, self.C, self.alpha,
                                        self.max_iter, max_steps,
                                        self.shrinking, vinit[k],
                                        rs, tol[k], self.callback,
                                        self.n_calls, self.verbose)
                    for k in range(n_vectors))

            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

            for k in range(n_vectors):
                if self.warm_start and k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

        if self.debiasing:
            nz = self.coef_ != 0

            if not self.warm_debiasing:
                self.coef_ = np.zeros((n_vectors, n_features),
                                      dtype=np.float64)
                self._init_errors(Y)

            indices = np.arange(n_features, dtype=np.int32)
            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                        ds, y, Y, k, False,
                                        indices[nz[k]], 2, self._get_loss(),
                                        "cyclic", self.permute,
                                        "violation_sum", self.Cd, 1.0,
                                        self.max_iter, max_steps, False, 0,
                                        rs, self.tol, self.callback,
                                        self.n_calls, self.verbose)
                    for k in range(n_vectors))
            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

        return self


class CDRegressor(_BaseCD, BaseRegressor):
    r"""Estimator for learning linear regressors by (block) coordinate
    descent.

    The objective functions considered take the form

        minimize F(W) = C * L(W) + alpha * R(W),

    where L(W) is a loss term and R(W) is a penalty term.

    Parameters
    ----------
    loss : str, 'squared'
        The loss function to be used.

    penalty : str, 'l2', 'l1', 'l1/l2'
        The penalty to be used.

        - l2: ridge
        - l1: lasso
        - l1/l2: group lasso

    For other parameters, see `CDClassifier`.
    """

    def __init__(self, C=1.0, alpha=1.0,
                 loss="squared", penalty="l2",
                 max_iter=50, tol=1e-3, termination="violation_sum",
                 shrinking=True, max_steps=30, sigma=0.01, beta=0.5,
                 warm_start=False, debiasing=False, Cd=1.0,
                 warm_debiasing=False, selection="cyclic", permute=True,
                 callback=None, n_calls=100, random_state=None, verbose=0,
                 n_jobs=1):
        self.C = C
        self.alpha = alpha
        self.loss = loss
        self.penalty = penalty
        self.max_iter = max_iter
        self.tol = tol
        self.termination = termination
        self.shrinking = shrinking
        self.max_steps = max_steps
        self.sigma = sigma
        self.beta = beta
        self.warm_start = warm_start
        self.debiasing = debiasing
        self.Cd = Cd
        self.warm_debiasing = warm_debiasing
        self.selection = selection
        self.permute = permute
        self.callback = callback
        self.n_calls = n_calls
        self.random_state = random_state
        self.verbose = verbose
        self.coef_ = None
        self.violation_init_ = {}
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : regressor
            Returns self.
        """
        rs = self._get_random_state()

        # Create dataset.
        ds = get_dataset(X, order="fortran")
        n_features = ds.get_n_features()

        self.outputs_2d_ = len(y.shape) == 2
        if self.outputs_2d_:
            Y = y
        else:
            Y = y.reshape(-1, 1)
        Y = np.asfortranarray(Y, dtype=np.float64)
        y = np.empty(0, dtype=np.int32)
        n_vectors = Y.shape[1]

        # Initialize coefficients.
        if not self.warm_start or self.coef_ is None:
            self.C_init = self.C
            self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
            self._init_errors(Y)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        indices = np.arange(n_features, dtype=np.int32)

        if self.penalty == "l1/l2":
            vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
            model = _primal_cd(self, self.coef_, self.errors_,
                               ds, y, Y, -1, False,
                               indices, 12, self._get_loss(),
                               self.selection, self.permute,
                               self.termination, self.C, self.alpha,
                               self.max_iter, self.max_steps,
                               self.shrinking, vinit,
                               rs, self.tol, self.callback, self.n_calls,
                               self.verbose)
            viol = model[0]
            if self.warm_start and len(self.violation_init_) == 0:
                self.violation_init_[0] = viol
        else:
            penalty = self._get_penalty()
            vinit = (np.asarray([self.violation_init_.get(k, 0)
                                 for k in range(n_vectors)]) *
                     self.C / self.C_init)
            jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                        ds, y, Y, k, False,
                                        indices, penalty, self._get_loss(),
                                        self.selection, self.permute,
                                        self.termination, self.C, self.alpha,
                                        self.max_iter, self.max_steps,
                                        self.shrinking, vinit[k],
                                        rs, self.tol, self.callback,
                                        self.n_calls, self.verbose)
                    for k in range(n_vectors))
            model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
            viol, coefs, errors = zip(*model)
            self.coef_ = np.asarray(coefs)
            self.errors_ = np.asarray(errors)

            # Store per-output violations under the same keys used to build
            # vinit above (the original stored the whole tuple under
            # n_vectors, which could never be read back).
            for k in range(n_vectors):
                if self.warm_start and k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

        return self
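

# The sketch below is illustrative and not part of the library: a minimal
# end-to-end run of both estimators on synthetic data. It assumes
# scikit-learn is installed (as in the CDClassifier docstring example) and
# that `lightning.regression` exposes CDRegressor, mirroring
# `lightning.classification`; the hyperparameter values are arbitrary.
if __name__ == "__main__":
    from sklearn.datasets import make_classification, make_regression

    from lightning.classification import CDClassifier
    from lightning.regression import CDRegressor

    # One-vs-rest classification with an l1 penalty: fit dispatches one
    # _primal_cd subproblem per binary problem through joblib.
    X, y = make_classification(n_samples=200, n_features=50,
                               n_informative=10, random_state=0)
    clf = CDClassifier(loss="squared_hinge", penalty="l1",
                       alpha=1e-3, max_iter=20, random_state=0).fit(X, y)
    print("classification accuracy:", clf.score(X, y))
    print("nonzero coefficients:", (clf.coef_ != 0).sum())

    # Squared-loss regression with an l2 (ridge) penalty.
    X, y = make_regression(n_samples=200, n_features=50, random_state=0)
    reg = CDRegressor(loss="squared", penalty="l2",
                      alpha=1e-3, max_iter=20, random_state=0).fit(X, y)
    print("regression R^2:", reg.score(X, y))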