Source code for metric_learn.itml

"""
Information Theoretic Metric Learning (ITML)
"""

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array
from sklearn.base import TransformerMixin
from .base_metric import _PairsClassifierMixin, MahalanobisMixin
from .constraints import Constraints, wrap_pairs
from ._util import components_from_metric, _initialize_metric_mahalanobis
import warnings


class _BaseITML(MahalanobisMixin):
  """Information Theoretic Metric Learning (ITML)"""

  _tuple_size = 2  # constraints are pairs

  def __init__(self, gamma=1., max_iter=1000, tol=1e-3,
               prior='identity', verbose=False,
               preprocessor=None, random_state=None,
               convergence_threshold='deprecated'):
    if convergence_threshold != 'deprecated':
      warnings.warn('"convergence_threshold" parameter has been '
                    ' renamed to "tol". It has been deprecated in'
                    ' version 0.6.3 and will be removed in 0.7.0'
                    '', FutureWarning)
      tol = convergence_threshold
    self.convergence_threshold = 'deprecated'  # Avoid errors
    self.gamma = gamma
    self.max_iter = max_iter
    self.tol = tol
    self.prior = prior
    self.verbose = verbose
    self.random_state = random_state
    super(_BaseITML, self).__init__(preprocessor)

  def _fit(self, pairs, y, bounds=None):
    pairs, y = self._prepare_inputs(pairs, y,
                                    type_of_inputs='tuples')
    # init bounds
    if bounds is None:
      X = np.unique(np.vstack(pairs), axis=0)
      self.bounds_ = np.percentile(pairwise_distances(X), (5, 95))
    else:
      bounds = check_array(bounds, allow_nd=False, ensure_min_samples=0,
                           ensure_2d=False)
      bounds = bounds.ravel()
      if bounds.size != 2:
        raise ValueError("`bounds` should be an array-like of two elements.")
      self.bounds_ = bounds
    self.bounds_[self.bounds_ == 0] = 1e-9
    # set the prior
    # pairs will be deduplicated into X two times, TODO: avoid that
    A = _initialize_metric_mahalanobis(pairs, self.prior, self.random_state,
                                       strict_pd=True,
                                       matrix_name='prior')
    gamma = self.gamma
    pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1]
    num_pos = len(pos_pairs)
    num_neg = len(neg_pairs)
    _lambda = np.zeros(num_pos + num_neg)
    lambdaold = np.zeros_like(_lambda)
    gamma_proj = 1. if gamma is np.inf else gamma / (gamma + 1.)
    pos_bhat = np.zeros(num_pos) + self.bounds_[0]
    neg_bhat = np.zeros(num_neg) + self.bounds_[1]
    pos_vv = pos_pairs[:, 0, :] - pos_pairs[:, 1, :]
    neg_vv = neg_pairs[:, 0, :] - neg_pairs[:, 1, :]

    for it in range(self.max_iter):
      # update positives
      for i, v in enumerate(pos_vv):
        wtw = v.dot(A).dot(v)  # scalar
        alpha = min(_lambda[i], gamma_proj * (1. / wtw - 1. / pos_bhat[i]))
        _lambda[i] -= alpha
        beta = alpha / (1 - alpha * wtw)
        pos_bhat[i] = 1. / ((1 / pos_bhat[i]) + (alpha / gamma))
        Av = A.dot(v)
        A += np.outer(Av, Av * beta)

      # update negatives
      for i, v in enumerate(neg_vv):
        wtw = v.dot(A).dot(v)  # scalar
        alpha = min(_lambda[i + num_pos],
                    gamma_proj * (1. / neg_bhat[i] - 1. / wtw))
        _lambda[i + num_pos] -= alpha
        beta = -alpha / (1 + alpha * wtw)
        neg_bhat[i] = 1. / ((1 / neg_bhat[i]) - (alpha / gamma))
        Av = A.dot(v)
        A += np.outer(Av, Av * beta)

      normsum = np.linalg.norm(_lambda) + np.linalg.norm(lambdaold)
      if normsum == 0:
        conv = np.inf
        break
      conv = np.abs(lambdaold - _lambda).sum() / normsum
      if conv < self.tol:
        break
      lambdaold = _lambda.copy()
      if self.verbose:
        print('itml iter: %d, conv = %f' % (it, conv))

    if self.verbose:
      print('itml converged at iter: %d, conv = %f' % (it, conv))
    self.n_iter_ = it

    self.components_ = components_from_metric(A)
    return self



[docs]
class ITML(_BaseITML, _PairsClassifierMixin):
  """Information Theoretic Metric Learning (ITML)

  `ITML` minimizes the (differential) relative entropy, aka Kullback-Leibler
  divergence, between two multivariate Gaussians subject to constraints on the
  associated Mahalanobis distance, which can be formulated into a Bregman
  optimization problem by minimizing the LogDet divergence subject to
  linear constraints. This algorithm can handle a wide variety of constraints
  and can optionally incorporate a prior on the distance function. Unlike some
  other methods, `ITML` does not rely on an eigenvalue computation or
  semi-definite programming.

  Read more in the :ref:`User Guide <itml>`.

  Parameters
  ----------
  gamma : float, optional (default=1.0)
    Value for slack variables

  max_iter : int, optional (default=1000)
    Maximum number of iteration of the optimization procedure.

  tol : float, optional (default=1e-3)
    Convergence tolerance.

  prior : string or numpy array, optional (default='identity')
    The Mahalanobis matrix to use as a prior. Possible options are
    'identity', 'covariance', 'random', and a numpy array of shape
    (n_features, n_features). For ITML, the prior should be strictly
    positive definite (PD).

    'identity'
      An identity matrix of shape (n_features, n_features).

    'covariance'
      The inverse covariance matrix.

    'random'
      The prior will be a random SPD matrix of shape
      `(n_features, n_features)`, generated using
      `sklearn.datasets.make_spd_matrix`.

    numpy array
      A positive definite (PD) matrix of shape
      (n_features, n_features), that will be used as such to set the
      prior.

  verbose : bool, optional (default=False)
    If True, prints information while learning

  preprocessor : array-like, shape=(n_samples, n_features) or callable
    The preprocessor to call to get tuples from indices. If array-like,
    tuples will be formed like this: X[indices].

  random_state : int or numpy.RandomState or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int. If
    ``prior='random'``, ``random_state`` is used to set the prior.

  convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0

  Attributes
  ----------
  bounds_ : `numpy.ndarray`, shape=(2,)
    Bounds on similarity, aside slack variables, s.t.
    ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a``
    and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of
    dissimilar points ``c`` and ``d``, with ``d`` the learned distance. If
    not provided at initialization, bounds_[0] and bounds_[1] are set at
    train time to the 5th and 95th percentile of the pairwise distances among
    all points present in the input `pairs`.

  n_iter_ : `int`
    The number of iterations the solver has run.

  components_ : `numpy.ndarray`, shape=(n_features, n_features)
    The linear transformation ``L`` deduced from the learned Mahalanobis
    metric (See function `components_from_metric`.)

  threshold_ : `float`
    If the distance metric between two points is lower than this threshold,
    points will be classified as similar, otherwise they will be
    classified as dissimilar.

  Examples
  --------
  >>> from metric_learn import ITML
  >>> pairs = [[[1.2, 7.5], [1.3, 1.5]],
  >>>         [[6.4, 2.6], [6.2, 9.7]],
  >>>         [[1.3, 4.5], [3.2, 4.6]],
  >>>         [[6.2, 5.5], [5.4, 5.4]]]
  >>> y = [1, 1, -1, -1]
  >>> # in this task we want points where the first feature is close to be
  >>> # closer to each other, no matter how close the second feature is
  >>> itml = ITML()
  >>> itml.fit(pairs, y)

  References
  ----------
  .. [1] Jason V. Davis, et al. `Information-theoretic Metric Learning
         <http://www.prateekjain.org/publications/all_papers\
         /DavisKJSD07_ICML.pdf>`_. ICML 2007.
  """


[docs]
  def fit(self, pairs, y, bounds=None, calibration_params=None):
    """Learn the ITML model.

    The threshold will be calibrated on the trainset using the parameters
    `calibration_params`.

    Parameters
    ----------
    pairs: array-like, shape=(n_constraints, 2, n_features) or \
           (n_constraints, 2)
      3D Array of pairs with each row corresponding to two points,
      or 2D array of indices of pairs if the metric learner uses a
      preprocessor.

    y: array-like, of shape (n_constraints,)
      Labels of constraints. Should be -1 for dissimilar pair, 1 for similar.

    bounds : array-like of two numbers
      Bounds on similarity, aside slack variables, s.t.
      ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a``
      and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of
      dissimilar points ``c`` and ``d``, with ``d`` the learned distance.
      If not provided at initialization, bounds_[0] and bounds_[1] will be
      set to the 5th and 95th percentile of the pairwise distances among all
      points present in the input `pairs`.

    calibration_params : `dict` or `None`
      Dictionary of parameters to give to `calibrate_threshold` for the
      threshold calibration step done at the end of `fit`. If `None` is
      given, `calibrate_threshold` will use the default parameters.

    Returns
    -------
    self : object
      Returns the instance.
    """
    calibration_params = (calibration_params if calibration_params is not
                          None else dict())
    self._validate_calibration_params(**calibration_params)
    self._fit(pairs, y, bounds=bounds)
    self.calibrate_threshold(pairs, y, **calibration_params)
    return self





[docs]
class ITML_Supervised(_BaseITML, TransformerMixin):
  """Supervised version of Information Theoretic Metric Learning (ITML)

  `ITML_Supervised` creates pairs of similar sample by taking same class
  samples, and pairs of dissimilar samples by taking different class
  samples. It then passes these pairs to `ITML` for training.

  Parameters
  ----------
  gamma : float, optional (default=1.0)
    Value for slack variables

  max_iter : int, optional (default=1000)
    Maximum number of iterations of the optimization procedure.

  tol : float, optional (default=1e-3)
    Tolerance of the optimization procedure.

  n_constraints : int, optional (default=None)
    Number of constraints to generate. If None, default to `20 *
    num_classes**2`.

  prior : string or numpy array, optional (default='identity')
    Initialization of the Mahalanobis matrix. Possible options are
    'identity', 'covariance', 'random', and a numpy array of shape
    (n_features, n_features). For ITML, the prior should be strictly
    positive definite (PD).

    'identity'
      An identity matrix of shape (n_features, n_features).

    'covariance'
      The inverse covariance matrix.

    'random'
      The prior will be a random SPD matrix of shape
      `(n_features, n_features)`, generated using
      `sklearn.datasets.make_spd_matrix`.

    numpy array
      A positive definite (PD) matrix of shape
      (n_features, n_features), that will be used as such to set the
      prior.

  verbose : bool, optional (default=False)
    If True, prints information while learning

  preprocessor : array-like, shape=(n_samples, n_features) or callable
    The preprocessor to call to get tuples from indices. If array-like,
    tuples will be formed like this: X[indices].

  random_state : int or numpy.RandomState or None, optional (default=None)
    A pseudo random number generator object or a seed for it if int. If
    ``prior='random'``, ``random_state`` is used to set the prior. In any
    case, `random_state` is also used to randomly sample constraints from
    labels.

  num_constraints : Renamed to n_constraints. Will be deprecated in 0.7.0

  convergence_threshold : Renamed to tol. Will be deprecated in 0.7.0

  Attributes
  ----------
  bounds_ : `numpy.ndarray`, shape=(2,)
    Bounds on similarity, aside slack variables, s.t.
    ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a``
    and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of
    dissimilar points ``c`` and ``d``, with ``d`` the learned distance.
    If not provided at initialization, bounds_[0] and bounds_[1] are set at
    train time to the 5th and 95th percentile of the pairwise distances
    among all points in the training data `X`.

  n_iter_ : `int`
    The number of iterations the solver has run.

  components_ : `numpy.ndarray`, shape=(n_features, n_features)
    The linear transformation ``L`` deduced from the learned Mahalanobis
    metric (See function `components_from_metric`.)

  Examples
  --------
  >>> from metric_learn import ITML_Supervised
  >>> from sklearn.datasets import load_iris
  >>> iris_data = load_iris()
  >>> X = iris_data['data']
  >>> Y = iris_data['target']
  >>> itml = ITML_Supervised(n_constraints=200)
  >>> itml.fit(X, Y)

  See Also
  --------
  metric_learn.ITML : The original weakly-supervised algorithm
  :ref:`supervised_version` : The section of the project documentation
    that describes the supervised version of weakly supervised estimators.
  """


[docs]
  def __init__(self, gamma=1.0, max_iter=1000, tol=1e-3,
               n_constraints=None, prior='identity',
               verbose=False, preprocessor=None, random_state=None,
               num_constraints='deprecated',
               convergence_threshold='deprecated'):
    _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter,
                       tol=tol,
                       prior=prior, verbose=verbose,
                       preprocessor=preprocessor,
                       random_state=random_state,
                       convergence_threshold=convergence_threshold)
    if num_constraints != 'deprecated':
      warnings.warn('"num_constraints" parameter has been renamed to'
                    ' "n_constraints". It has been deprecated in'
                    ' version 0.6.3 and will be removed in 0.7.0'
                    '', FutureWarning)
      n_constraints = num_constraints
    self.n_constraints = n_constraints
    # Avoid test get_params from failing (all params passed sholud be set)
    self.num_constraints = 'deprecated'



[docs]
  def fit(self, X, y, bounds=None):
    """Create constraints from labels and learn the ITML model.


    Parameters
    ----------
    X : (n x d) matrix
      Input data, where each row corresponds to a single instance.

    y : (n) array-like
      Data labels.

    bounds : array-like of two numbers
      Bounds on similarity, aside slack variables, s.t.
      ``d(a, b) < bounds_[0]`` for all given pairs of similar points ``a``
      and ``b``, and ``d(c, d) > bounds_[1]`` for all given pairs of
      dissimilar points ``c`` and ``d``, with ``d`` the learned distance.
      If not provided at initialization, bounds_[0] and bounds_[1] will be
      set to the 5th and 95th percentile of the pairwise distances among all
      points in the training data `X`.
    """
    X, y = self._prepare_inputs(X, y, ensure_min_samples=2)
    n_constraints = self.n_constraints
    if n_constraints is None:
      num_classes = len(np.unique(y))
      n_constraints = 20 * num_classes**2

    c = Constraints(y)
    pos_neg = c.positive_negative_pairs(n_constraints,
                                        random_state=self.random_state)
    pairs, y = wrap_pairs(X, pos_neg)
    return _BaseITML._fit(self, pairs, y, bounds=bounds)