Source code for lightning.impl.sgd

"""
===================================
Stochastic Gradient Descent Solvers
===================================

This module provides SGD solvers for a variety of loss
functions and penalties.
"""
# Author: Mathieu Blondel
# License: BSD

import warnings

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.validation import assert_all_finite

from .base import BaseClassifier
from .base import BaseRegressor

from .dataset_fast import get_dataset

from .sgd_fast import _binary_sgd
from .sgd_fast import _multiclass_sgd

from .sgd_fast import ModifiedHuber
from .sgd_fast import Hinge
from .sgd_fast import SquaredHinge
from .sgd_fast import Log
from .sgd_fast import SquaredLoss
from .sgd_fast import Huber
from .sgd_fast import EpsilonInsensitive

from .sgd_fast import MulticlassLog
from .sgd_fast import MulticlassHinge
from .sgd_fast import MulticlassSquaredHinge


class _BaseSGD(object):

    def _get_penalty(self):
        # Map the user-facing penalty name to the integer code expected
        # by the Cython solvers in sgd_fast.
        penalties = {
            "nn": -1,
            "nnl1": -1,
            "nnl2": -2,
            "l1": 1,
            "l2": 2,
            "l1/l2": 12
        }
        return penalties[self.penalty]

    def _get_learning_rate(self):
        # Same convention: the learning-rate schedule is passed to the
        # Cython solvers as an integer code.
        learning_rates = {"constant": 1, "pegasos": 2, "invscaling": 3}
        return learning_rates[self.learning_rate]

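# A minimal sketch (not part of the original module) of how the helpers
# above behave: any estimator mixing in _BaseSGD resolves its string
# options to the integer codes consumed by the Cython solvers.  Shown
# commented out because SGDClassifier is only defined further down.
#
#     opts = SGDClassifier(penalty="l1/l2", learning_rate="invscaling")
#     opts._get_penalty()        # -> 12
#     opts._get_learning_rate()  # -> 3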

class SGDClassifier(BaseClassifier, _BaseSGD):
    r"""Estimator for learning linear classifiers by SGD.

    Parameters
    ----------
    loss : str, 'hinge', 'squared_hinge', 'log', 'perceptron',
           'modified_huber', 'squared', 'huber', 'epsilon_insensitive'
        Loss function to be used.

    penalty : str, 'l2', 'l1', 'l1/l2'
        The penalty to be used.

        - l2: ridge
        - l1: lasso
        - l1/l2: group lasso

    multiclass : bool
        Whether to use a direct multiclass formulation (True) or
        one-vs-rest (False). Direct formulations are only available for
        loss='hinge', 'squared_hinge' and 'log'.

    alpha : float
        Weight of the penalty term.

    learning_rate : 'pegasos', 'constant', 'invscaling'
        Learning rate schedule to use.

    eta0 : float
        Step size.

    power_t : float
        Power to be used (when learning_rate='invscaling').

    epsilon : float
        Value to be used for the Huber and epsilon-insensitive losses.

    fit_intercept : bool
        Whether to fit the intercept or not.

    intercept_decay : float
        Value by which the intercept is multiplied (to regularize it).

    max_iter : int
        Maximum number of passes (epochs) over the training data.

    shuffle : bool
        Whether to shuffle the data.

    callback : callable
        Callback function.

    n_calls : int
        Frequency with which `callback` should be called.

    random_state : RandomState or int
        The seed of the pseudo random number generator to use.

    verbose : int
        Verbosity level.

    Examples
    --------
    >>> from sklearn.datasets import fetch_20newsgroups_vectorized
    >>> from lightning.classification import SGDClassifier
    >>> bunch = fetch_20newsgroups_vectorized(subset="all")
    >>> X, y = bunch.data, bunch.target
    >>> clf = SGDClassifier().fit(X, y)
    >>> accuracy = clf.score(X, y)
    """

    def __init__(self, loss="hinge", penalty="l2", multiclass=False,
                 alpha=0.01, learning_rate="pegasos", eta0=0.03,
                 power_t=0.5, epsilon=0.01, fit_intercept=True,
                 intercept_decay=1.0, max_iter=10, shuffle=True,
                 random_state=None, callback=None, n_calls=100,
                 verbose=0):
        self.loss = loss
        self.penalty = penalty
        self.multiclass = multiclass
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.eta0 = eta0
        self.power_t = power_t
        self.epsilon = epsilon
        self.fit_intercept = fit_intercept
        self.intercept_decay = intercept_decay
        self.max_iter = max_iter
        self.shuffle = shuffle
        self.random_state = random_state
        self.callback = callback
        self.n_calls = n_calls
        self.verbose = verbose
        self.coef_ = None
        self.intercept_ = None

    def _get_loss(self):
        if self.multiclass:
            losses = {
                "log": MulticlassLog(),
                "hinge": MulticlassHinge(),
                "squared_hinge": MulticlassSquaredHinge(),
            }
        else:
            losses = {
                "modified_huber": ModifiedHuber(),
                "hinge": Hinge(1.0),
                "squared_hinge": SquaredHinge(1.0),
                "perceptron": Hinge(0.0),
                "log": Log(),
                "squared": SquaredLoss(),
                "huber": Huber(self.epsilon),
                "epsilon_insensitive": EpsilonInsensitive(self.epsilon)
            }
        return losses[self.loss]

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : classifier
            Returns self.
        """
        rs = check_random_state(self.random_state)

        reencode = self.multiclass
        y, n_classes, n_vectors = self._set_label_transformers(y, reencode)

        ds = get_dataset(X)
        n_samples = ds.get_n_samples()
        n_features = ds.get_n_features()
        self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        loss = self._get_loss()
        penalty = self._get_penalty()

        if n_vectors == 1 or not self.multiclass:
            # One-vs-rest: train one binary solver per column of the
            # binarized label matrix.
            Y = np.asfortranarray(self.label_binarizer_.fit_transform(y),
                                  dtype=np.float64)
            for i in range(n_vectors):
                _binary_sgd(self,
                            self.coef_, self.intercept_, i,
                            ds, Y[:, i], loss, penalty,
                            self.alpha,
                            self._get_learning_rate(),
                            self.eta0, self.power_t,
                            self.fit_intercept,
                            self.intercept_decay,
                            # max_iter counts epochs; the solver expects
                            # the total number of updates.
                            int(self.max_iter * n_samples),
                            self.shuffle, rs,
                            self.callback, self.n_calls,
                            self.verbose)

        elif self.multiclass:
            _multiclass_sgd(self, self.coef_, self.intercept_,
                            ds, y.astype(np.int32), loss, penalty,
                            self.alpha, self._get_learning_rate(),
                            self.eta0, self.power_t, self.fit_intercept,
                            self.intercept_decay,
                            int(self.max_iter * n_samples),
                            self.shuffle, rs, self.callback, self.n_calls,
                            self.verbose)

        else:
            raise ValueError("Wrong value for multiclass.")

        try:
            assert_all_finite(self.coef_)
        except ValueError:
            warnings.warn("coef_ contains infinite values")

        return self
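

# A minimal usage sketch (illustrative, not part of the original module):
# fit the classifier in both one-vs-rest and direct multiclass modes on a
# small synthetic problem.  Assumes scikit-learn's make_classification
# helper; the function below is hypothetical and not in the library.
def _example_classifier_usage():
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=20, n_classes=3,
                               n_informative=6, random_state=0)
    ovr = SGDClassifier(multiclass=False, random_state=0).fit(X, y)
    direct = SGDClassifier(loss="hinge", multiclass=True,
                           random_state=0).fit(X, y)
    # Both estimators expose coef_ with shape (n_classes, n_features).
    return ovr.score(X, y), direct.score(X, y)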


class SGDRegressor(BaseRegressor, _BaseSGD):
    r"""Estimator for learning linear regressors by SGD.

    Parameters
    ----------
    loss : str, 'squared', 'epsilon_insensitive', 'huber'
        Loss function to be used.

    penalty : str, 'l2', 'l1', 'l1/l2'
        The penalty to be used.

        - l2: ridge
        - l1: lasso
        - l1/l2: group lasso

    alpha : float
        Weight of the penalty term.

    learning_rate : 'pegasos', 'constant', 'invscaling'
        Learning rate schedule to use.

    eta0 : float
        Step size.

    power_t : float
        Power to be used (when learning_rate='invscaling').

    epsilon : float
        Value to be used for the Huber and epsilon-insensitive losses.

    fit_intercept : bool
        Whether to fit the intercept or not.

    intercept_decay : float
        Value by which the intercept is multiplied (to regularize it).

    max_iter : int
        Maximum number of passes (epochs) over the training data.

    shuffle : bool
        Whether to shuffle the data.

    callback : callable
        Callback function.

    n_calls : int
        Frequency with which `callback` should be called.

    random_state : RandomState or int
        The seed of the pseudo random number generator to use.

    verbose : int
        Verbosity level.
    """

    def __init__(self, loss="squared", penalty="l2", alpha=0.01,
                 learning_rate="pegasos", eta0=0.03, power_t=0.5,
                 epsilon=0.01, fit_intercept=True, intercept_decay=1.0,
                 max_iter=10, shuffle=True, random_state=None,
                 callback=None, n_calls=100, verbose=0):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.eta0 = eta0
        self.power_t = power_t
        self.epsilon = epsilon
        self.fit_intercept = fit_intercept
        self.intercept_decay = intercept_decay
        self.max_iter = max_iter
        self.shuffle = shuffle
        self.random_state = random_state
        self.callback = callback
        self.n_calls = n_calls
        self.verbose = verbose
        self.coef_ = None

    def _get_loss(self):
        losses = {
            "squared": SquaredLoss(),
            "huber": Huber(self.epsilon),
            "epsilon_insensitive": EpsilonInsensitive(self.epsilon)
        }
        return losses[self.loss]

    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : regressor
            Returns self.
        """
        rs = check_random_state(self.random_state)

        ds = get_dataset(X)
        n_samples = ds.get_n_samples()
        n_features = ds.get_n_features()

        self.outputs_2d_ = len(y.shape) == 2
        if self.outputs_2d_:
            Y = y
        else:
            Y = y.reshape(-1, 1)
        Y = np.asfortranarray(Y)
        n_vectors = Y.shape[1]

        self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        loss = self._get_loss()
        penalty = self._get_penalty()

        for k in range(n_vectors):
            _binary_sgd(self,
                        self.coef_, self.intercept_, k,
                        ds, Y[:, k], loss, penalty,
                        self.alpha,
                        self._get_learning_rate(),
                        self.eta0, self.power_t,
                        self.fit_intercept,
                        self.intercept_decay,
                        int(self.max_iter * n_samples),
                        self.shuffle, rs,
                        self.callback, self.n_calls,
                        self.verbose)

        try:
            assert_all_finite(self.coef_)
        except ValueError:
            warnings.warn("coef_ contains infinite values")

        return self

    def predict(self, X):
        """Perform regression on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p : array, shape = [n_samples] or [n_samples, n_targets]
            Predicted target values for X.
        """
        try:
            assert_all_finite(self.coef_)
            pred = safe_sparse_dot(X, self.coef_.T)
            pred += self.intercept_
        except ValueError:
            # Training diverged (non-finite coefficients): fall back to
            # all-zero predictions rather than propagating NaN/inf.
            n_samples = X.shape[0]
            n_vectors = self.coef_.shape[0]
            pred = np.zeros((n_samples, n_vectors))

        if not self.outputs_2d_:
            pred = pred.ravel()

        return pred
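

# A minimal usage sketch (illustrative, not part of the original module):
# single- and multi-output regression with the epsilon-insensitive loss.
# Assumes scikit-learn's make_regression helper; the function below is
# hypothetical and not in the library.
def _example_regressor_usage():
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=100, n_features=10, n_targets=2,
                           random_state=0)
    reg = SGDRegressor(loss="epsilon_insensitive", epsilon=0.05,
                       random_state=0).fit(X, y)
    # y is 2-dimensional here, so predictions keep shape
    # (n_samples, n_targets).
    return reg.predict(X).shape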