# Source code for metric_learn.base_metric

"""
Base module.
"""

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.extmath import stable_cumsum
from sklearn.utils.validation import _is_arraylike, check_is_fitted
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
import numpy as np
from abc import ABCMeta, abstractmethod
from ._util import ArrayIndexer, check_input, validate_vector
import warnings


class BaseMetricLearner(BaseEstimator, metaclass=ABCMeta):
    """
    Base class for all metric-learners.

    Parameters
    ----------
    preprocessor : array-like, shape=(n_samples, n_features) or callable
      The preprocessor to call to get tuples from indices. If array-like,
      tuples will be gotten like this: X[indices].
    """

    def __init__(self, preprocessor=None):
        self.preprocessor = preprocessor

    @abstractmethod
    def score_pairs(self, pairs):
        """
        Returns the score between pairs
        (can be a similarity, or a distance/metric depending on the algorithm)

        .. deprecated:: 0.7.0
           Refer to `pair_distance` and `pair_score`.

        .. warning::
           This method will be removed in 0.8.0. Please refer to
           `pair_distance` or `pair_score`. This change will occur in order
           to add learners that don't necessarily learn a Mahalanobis
           distance.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to score, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The score of every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `score_pairs` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.
        """

    @abstractmethod
    def pair_score(self, pairs):
        """
        .. versionadded:: 0.7.0 Compute the similarity score between pairs

        Returns the similarity score between pairs of points (the larger the
        score, the more similar the pair). For metric learners that learn a
        distance, the score is simply the opposite of the distance between
        pairs. All learners have access to this method.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to score, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The score of every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `pair_score` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.
        """

    @abstractmethod
    def pair_distance(self, pairs):
        """
        .. versionadded:: 0.7.0 Compute the distance between pairs

        Returns the (pseudo) distance between pairs, when available. For
        metric learners that do not learn a (pseudo) distance, an error is
        thrown instead.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs for which to compute the distance, with each
          row corresponding to two points, or 2D array of indices of pairs
          if the metric learner uses a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The distance between every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `pair_distance` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.
        """

    def _check_preprocessor(self):
        """Initializes the preprocessor.

        Sets ``self.preprocessor_`` to an `ArrayIndexer` wrapping
        ``self.preprocessor`` when the latter is array-like, or to
        ``self.preprocessor`` itself when it is a callable or None.

        Raises
        ------
        ValueError
          If ``self.preprocessor`` is neither None, array-like nor callable.
        """
        if _is_arraylike(self.preprocessor):
            self.preprocessor_ = ArrayIndexer(self.preprocessor)
        elif callable(self.preprocessor) or self.preprocessor is None:
            self.preprocessor_ = self.preprocessor
        else:
            raise ValueError("Invalid type for the preprocessor: {}. You should "
                             "provide either None, an array-like object, "
                             "or a callable.".format(type(self.preprocessor)))

    def _prepare_inputs(self, X, y=None, type_of_inputs='classic', **kwargs):
        """Initializes the preprocessor and processes inputs.

        See `check_input` for more details.

        Parameters
        ----------
        X : array-like
          The input data array to check.

        y : array-like
          The input labels array to check.

        type_of_inputs : `str` {'classic', 'tuples'}
          The type of inputs to check. If 'classic', the input should be
          a 2D array-like of points or a 1D array like of indicators of
          points. If 'tuples', the input should be a 3D array-like of tuples
          or a 2D array-like of indicators of tuples.

        **kwargs : dict
          Arguments to pass to check_input.

        Returns
        -------
        X : `numpy.ndarray`
          The checked input data array.

        y : `numpy.ndarray` (optional)
          The checked input labels array.
        """
        self._check_preprocessor()

        check_is_fitted(self, ['preprocessor_'])

        outs = check_input(X, y,
                           type_of_inputs=type_of_inputs,
                           preprocessor=self.preprocessor_,
                           estimator=self,
                           tuple_size=getattr(self, '_tuple_size', None),
                           **kwargs)
        # Conform to SLEP010: record the number of features once.
        if not hasattr(self, 'n_features_in_'):
            checked_X = outs if y is None else outs[0]
            # The features live on the last axis for both kinds of inputs.
            # Other methods of this module index checked tuples as
            # ``pairs[:, 1, :]``, i.e. axis 1 of a 3D tuples array is the
            # position inside the tuple, so ``shape[1]`` would store the
            # tuple size rather than the number of features.
            self.n_features_in_ = checked_X.shape[-1]
        return outs

    @abstractmethod
    def get_metric(self):
        """Returns a function that takes as input two 1D arrays and outputs
        the value of the learned metric on these two points. Depending on the
        algorithm, it can return a distance or a similarity function between
        pairs.

        This function will be independent from the metric learner that
        learned it (it will not be modified if the initial metric learner is
        modified), and it can be directly plugged into the `metric` argument
        of scikit-learn's estimators.

        Returns
        -------
        metric_fun : function
          The function described above.


        Examples
        --------
        .. doctest::

          >>> from metric_learn import NCA
          >>> from sklearn.datasets import make_classification
          >>> from sklearn.neighbors import KNeighborsClassifier
          >>> nca = NCA()
          >>> X, y = make_classification()
          >>> nca.fit(X, y)
          >>> knn = KNeighborsClassifier(metric=nca.get_metric())
          >>> knn.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
          KNeighborsClassifier(algorithm='auto', leaf_size=30,
            metric=<function MahalanobisMixin.get_metric.<locals>.metric_fun
                    at 0x...>,
            metric_params=None, n_jobs=None, n_neighbors=5, p=2,
            weights='uniform')

        See Also
        --------
        pair_distance : a method that returns the distance between several
          pairs of points. Unlike `get_metric`, this is a method of the
          metric learner and therefore can change if the metric learner
          changes. Besides, it can use the metric learner's preprocessor,
          and works on concatenated arrays.

        pair_score : a method that returns the similarity score between
          several pairs of points. Unlike `get_metric`, this is a method of
          the metric learner and therefore can change if the metric learner
          changes. Besides, it can use the metric learner's preprocessor,
          and works on concatenated arrays.
        """
class MetricTransformer(metaclass=ABCMeta):
    """
    Interface shared by every learner able to map data into the space
    induced by the metric it has learned.
    """

    @abstractmethod
    def transform(self, X):
        """Applies the metric transformation.

        Concrete learners must override this method.

        Parameters
        ----------
        X : (n x d) matrix
          Data to transform.

        Returns
        -------
        transformed : (n x d) matrix
          Input data transformed to the metric space by :math:`XL^{\\top}`
        """
class MahalanobisMixin(BaseMetricLearner, MetricTransformer,
                       metaclass=ABCMeta):
    r"""Mahalanobis metric learning algorithms.

    Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`,
    defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x,
    x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is a learned symmetric
    positive semi-definite (PSD) matrix. The metric between points can then be
    expressed as the euclidean distance between points embedded in a new space
    through a linear transformation. Indeed, the above matrix can be decomposed
    into the product of two transpose matrices (through SVD or Cholesky
    decomposition): :math:`d_M(x, x')^2 = (x-x')^T M (x-x') = (x-x')^T L^T L
    (x-x') = (L x - L x')^T (L x- L x')`

    Attributes
    ----------
    components_ : `numpy.ndarray`, shape=(n_components, n_features)
      The learned linear transformation ``L``.
    """

    def score_pairs(self, pairs):
        r"""
        Returns the learned Mahalanobis distance between pairs.

        This distance is defined as: :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}`
        where ``M`` is the learned Mahalanobis matrix, for every pair of points
        ``x`` and ``x'``. This corresponds to the euclidean distance between
        embeddings of the points in a new space, obtained through a linear
        transformation. Indeed, we have also: :math:`d_M(x, x') = \sqrt{(x_e -
        x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See
        :class:`MahalanobisMixin`).

        .. deprecated:: 0.7.0
           Please use `pair_distance` instead.

        .. warning::
           This method will be removed in 0.8.0. Please refer to
           `pair_distance` or `pair_score`. This change will occur in order
           to add learners that don't necessarily learn a Mahalanobis
           distance.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to score, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The learned Mahalanobis distance for every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `score_pairs` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.

        :ref:`mahalanobis_distances` : The section of the project
          documentation that describes Mahalanobis Distances.
        """
        # The replacement method is `pair_distance` (singular): the message
        # must name a method that actually exists on the learner.
        dpr_msg = ("score_pairs will be deprecated in release 0.7.0. "
                   "Use pair_score to compute similarity scores, or "
                   "pair_distance to compute distances.")
        # stacklevel=2 attributes the warning to the caller of score_pairs.
        warnings.warn(dpr_msg, category=FutureWarning, stacklevel=2)
        return self.pair_distance(pairs)

    def pair_score(self, pairs):
        """
        Returns the opposite of the learned Mahalanobis distance between
        pairs.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to score, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The opposite of the learned Mahalanobis distance for every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `pair_score` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.

        :ref:`mahalanobis_distances` : The section of the project
          documentation that describes Mahalanobis Distances.
        """
        return -1 * self.pair_distance(pairs)

    def pair_distance(self, pairs):
        r"""
        Returns the learned Mahalanobis distance between pairs.

        This distance is defined as: :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}`
        where ``M`` is the learned Mahalanobis matrix, for every pair of points
        ``x`` and ``x'``. This corresponds to the euclidean distance between
        embeddings of the points in a new space, obtained through a linear
        transformation. Indeed, we have also: :math:`d_M(x, x') = \sqrt{(x_e -
        x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See
        :class:`MahalanobisMixin`).

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to score, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        scores : `numpy.ndarray` of shape=(n_pairs,)
          The learned Mahalanobis distance for every pair.

        See Also
        --------
        get_metric : a method that returns a function to compute the metric
          between two points. The difference with `pair_distance` is that it
          works on two 1D arrays and cannot use a preprocessor. Besides, the
          returned function is independent of the metric learner and hence
          is not modified if the metric learner is.

        :ref:`mahalanobis_distances` : The section of the project
          documentation that describes Mahalanobis Distances.
        """
        check_is_fitted(self, ['preprocessor_'])
        pairs = check_input(pairs, type_of_inputs='tuples',
                            preprocessor=self.preprocessor_,
                            estimator=self, tuple_size=2)
        # (for MahalanobisMixin, the embedding is linear so we can just embed
        # the difference)
        pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :])
        return np.sqrt(np.sum(pairwise_diffs**2, axis=-1))

    def transform(self, X):
        """Embeds data points in the learned linear embedding space.

        Transforms samples in ``X`` into ``X_embedded``, samples inside a new
        embedding space such that: ``X_embedded = X.dot(L.T)``, where ``L`` is
        the learned linear transformation (See :class:`MahalanobisMixin`).

        Parameters
        ----------
        X : `numpy.ndarray`, shape=(n_samples, n_features)
          The data points to embed.

        Returns
        -------
        X_embedded : `numpy.ndarray`, shape=(n_samples, n_components)
          The embedded data points.
        """
        check_is_fitted(self, ['preprocessor_', 'components_'])
        X_checked = check_input(X, type_of_inputs='classic', estimator=self,
                                preprocessor=self.preprocessor_,
                                accept_sparse=True)
        return X_checked.dot(self.components_.T)

    def get_metric(self):
        check_is_fitted(self, 'components_')
        # Copy so that the returned function keeps working on the metric as
        # it was when get_metric was called, even if the learner changes.
        components_T = self.components_.T.copy()

        def metric_fun(u, v, squared=False):
            """This function computes the metric between u and v, according
            to the previously learned metric.

            Parameters
            ----------
            u : array-like, shape=(n_features,)
              The first point involved in the distance computation.

            v : array-like, shape=(n_features,)
              The second point involved in the distance computation.

            squared : `bool`
              If True, the function will return the squared metric between u
              and v, which is faster to compute.

            Returns
            -------
            distance : float
              The distance between u and v according to the new metric.
            """
            u = validate_vector(u)
            v = validate_vector(v)
            transformed_diff = (u - v).dot(components_T)
            dist = np.dot(transformed_diff, transformed_diff.T)
            if not squared:
                dist = np.sqrt(dist)
            return dist

        return metric_fun

    # Reuse the documentation written on the abstract method.
    get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__

    def get_mahalanobis_matrix(self):
        """Returns a copy of the Mahalanobis matrix learned by the metric
        learner.

        Returns
        -------
        M : `numpy.ndarray`, shape=(n_features, n_features)
          The copy of the learned Mahalanobis matrix.
        """
        check_is_fitted(self, 'components_')
        return self.components_.T.dot(self.components_)
class _PairsClassifierMixin(BaseMetricLearner, ClassifierMixin):
    """Base class for pairs learners.

    Attributes
    ----------
    threshold_ : `float`
      If the distance metric between two points is lower than this threshold,
      points will be classified as similar, otherwise they will be
      classified as dissimilar.
    """

    # NOTE(review): `predict` returns labels in {-1, +1} whereas `classes_`
    # lists [0, 1] -- confirm which one downstream scikit-learn tooling
    # should rely on before changing either.
    classes_ = np.array([0, 1])
    _tuple_size = 2  # number of points in a tuple, 2 for pairs

    def predict(self, pairs):
        """Predicts whether the pairs are similar (+1) or dissimilar (-1).

        The opposite of `decision_function` (i.e. the learned distance, for
        learners whose score is the opposite of a distance) is compared with
        ``threshold_``: a pair is labelled +1 when that value is lower than
        or equal to the threshold, and -1 otherwise.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to predict, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        y_predicted : `numpy.ndarray` of ints in {-1, +1}, shape=(n_pairs,)
          The predicted label of every pair.

        Raises
        ------
        AttributeError
          If no threshold has been set on this estimator yet.
        """
        check_is_fitted(self, 'preprocessor_')

        if "threshold_" not in vars(self):
            msg = ("A threshold for this estimator has not been set, "
                   "call its set_threshold or calibrate_threshold method.")
            raise AttributeError(msg)
        return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1

    def decision_function(self, pairs):
        """Returns the decision function used to classify the pairs.

        Returns the opposite of the learned metric value between samples in
        every pair, to be consistent with scikit-learn conventions. Hence it
        should ideally be low for dissimilar samples and high for similar
        samples. This is the decision function that is used to classify
        pairs as similar (+1), or dissimilar (-1).

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs to predict, with each row corresponding to two
          points, or 2D array of indices of pairs if the metric learner uses
          a preprocessor.

        Returns
        -------
        y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,)
          The predicted decision function value for each pair.
        """
        check_is_fitted(self, 'preprocessor_')
        pairs = check_input(pairs, type_of_inputs='tuples',
                            preprocessor=self.preprocessor_,
                            estimator=self, tuple_size=self._tuple_size)
        return self.pair_score(pairs)

    def score(self, pairs, y):
        """Computes score of pairs similarity prediction.

        Returns the ``roc_auc`` score of the fitted metric learner. It is
        computed in the following way: for every value of a threshold
        ``t`` we classify all pairs of samples where the predicted distance
        is inferior to ``t`` as belonging to the "similar" class, and the
        other as belonging to the "dissimilar" class, and we count false
        positive and true positives as in a classical ``roc_auc`` curve.

        Parameters
        ----------
        pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2)
          3D Array of pairs, with each row corresponding to two points,
          or 2D array of indices of pairs if the metric learner uses a
          preprocessor.

        y : array-like, shape=(n_constraints,)
          The corresponding labels.

        Returns
        -------
        score : float
          The ``roc_auc`` score.
        """
        return roc_auc_score(y, self.decision_function(pairs))

    def set_threshold(self, threshold):
        """Sets the threshold of the metric learner to the given value
        `threshold`.

        See more in the :ref:`User Guide <calibration>`.

        Parameters
        ----------
        threshold : float
          The threshold value we want to set. It is the value to which the
          predicted distance for test pairs will be compared. Pairs whose
          distance is lower than or equal to the threshold will be
          classified as similar (+1), and as dissimilar (-1) otherwise.

        Returns
        -------
        self : `_PairsClassifier`
          The pairs classifier with the new threshold set.

        Raises
        ------
        ValueError
          If `threshold` cannot be converted to a float.
        """
        check_is_fitted(self, 'preprocessor_')
        try:
            self.threshold_ = float(threshold)
        except (TypeError, ValueError):
            # float() raises TypeError for unsupported types and ValueError
            # for unparsable strings; both are reported the same way.
            raise ValueError('Parameter threshold must be a real number. '
                             'Got {} instead.'.format(type(threshold)))
        return self

    def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
                            min_rate=None, beta=1.):
        """Decision threshold calibration for pairwise binary classification

        Method that calibrates the decision threshold (cutoff point) of the
        metric learner. This threshold will then be used when calling the
        method `predict`. The methods for picking cutoff points make use of
        traditional binary classification evaluation statistics such as the
        true positive and true negative rates and F-scores. The threshold
        will be found to maximize the chosen score on the validation set
        ``(pairs_valid, y_valid)``.

        See more in the :ref:`User Guide <calibration>`.

        Parameters
        ----------
        pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features)
          The validation set of pairs to use to set the threshold.

        y_valid : array-like, shape=(n_pairs_valid,)
          The labels of the pairs of the validation set to use to set the
          threshold. They must be +1 for positive pairs and -1 for negative
          pairs.

        strategy : str, optional (default='accuracy')
          The strategy to use for choosing the cutoff threshold.

          'accuracy'
              Selects a decision threshold that maximizes the accuracy.
          'f_beta'
              Selects a decision threshold that maximizes the f_beta score,
              with beta given by the parameter `beta`.
          'max_tpr'
              Selects a decision threshold that yields the highest true
              positive rate with true negative rate at least equal to the
              value of the parameter `min_rate`.
          'max_tnr'
              Selects a decision threshold that yields the highest true
              negative rate with true positive rate at least equal to the
              value of the parameter `min_rate`.

        min_rate : float in [0, 1] or None, (default=None)
          In case strategy is 'max_tpr' or 'max_tnr' this parameter must be
          set to specify the minimal value for the true negative rate or
          true positive rate respectively that needs to be achieved.

        beta : float, optional (default=1.)
          Beta value to be used in case strategy == 'f_beta'.

        Returns
        -------
        self : `_PairsClassifier`
          The pairs classifier with the calibrated threshold set.

        References
        ----------
        .. [1] Receiver-operating characteristic (ROC) plots: a fundamental
               evaluation tool in clinical medicine, MH Zweig, G Campbell -
               Clinical chemistry, 1993

        .. [2] Most of the code of this function is from scikit-learn's PR
               #10117

        See Also
        --------
        sklearn.calibration : scikit-learn's module for calibrating
          classifiers
        """
        check_is_fitted(self, 'preprocessor_')

        self._validate_calibration_params(strategy, min_rate, beta)

        pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                    type_of_inputs='tuples')

        n_samples = pairs_valid.shape[0]
        if strategy == 'accuracy':
            scores = self.decision_function(pairs_valid)
            scores_sorted_idces = np.argsort(scores)[::-1]
            scores_sorted = scores[scores_sorted_idces]
            # true labels ordered by decision_function value: (higher first)
            y_ordered = y_valid[scores_sorted_idces]
            # we need to add a threshold that will reject all points
            scores_sorted = np.concatenate([[scores_sorted[0] + 1],
                                            scores_sorted])

            # finds the threshold that maximizes the accuracy:
            # cumulative number of true positives
            cum_tp = stable_cumsum(y_ordered == 1)
            # we need to add the point where all samples are rejected:
            cum_tp = np.concatenate([[0.], cum_tp])
            cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1)
            cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1]
            cum_accuracy = (cum_tp + cum_tn) / n_samples
            imax = np.argmax(cum_accuracy)
            # we set the threshold to the lowest accepted score
            # note: we are working with negative distances but we want the
            # threshold to be with respect to the actual distances so we
            # take minus sign
            self.threshold_ = - scores_sorted[imax]
            # note: if the best is to reject all points it's already one of
            # the thresholds (scores_sorted[0])
            return self

        if strategy == 'f_beta':
            precision, recall, thresholds = precision_recall_curve(
                y_valid, self.decision_function(pairs_valid), pos_label=1)

            # We ignore the warnings here, in the same taste as
            # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065
            # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284
            with np.errstate(divide='ignore', invalid='ignore'):
                f_beta = ((1 + beta**2) * (precision * recall) /
                          (beta**2 * precision + recall))
            # We need to set nans to zero otherwise they will be considered
            # higher than the others (also discussed in
            # https://github.com/scikit-learn/
            # scikit-learn/pull/10117/files#r262115773)
            f_beta[np.isnan(f_beta)] = 0.
            imax = np.argmax(f_beta)
            # we set the threshold to the lowest accepted score
            # note: we are working with negative distances but we want the
            # threshold to be with respect to the actual distances so we
            # take minus sign
            self.threshold_ = - thresholds[imax]
            # Note: we don't need to deal with rejecting all points (i.e.
            # threshold = max_scores + 1), since this can never happen to be
            # optimal (see a more detailed discussion in
            # test_calibrate_threshold_extreme)
            return self

        # Parameter validation above only lets 'max_tpr' and 'max_tnr' reach
        # this point.
        fpr, tpr, thresholds = roc_curve(y_valid,
                                         self.decision_function(pairs_valid),
                                         pos_label=1)

        if strategy == 'max_tpr':
            indices = np.where(1 - fpr >= min_rate)[0]
            imax = np.argmax(tpr[indices])
        else:  # strategy == 'max_tnr'
            indices = np.where(tpr >= min_rate)[0]
            imax = np.argmax(1 - fpr[indices])

        # `indices` only holds valid positions of `thresholds` (fpr, tpr and
        # thresholds have the same length), so no out-of-range special case
        # is needed. Thanks to roc_curve, the first point will always be
        # max_scores + 1, see:
        # https://github.com/scikit-learn/scikit-learn/pull/13523
        imax_valid = indices[imax]
        # note: we are working with negative distances but we want the
        # threshold to be with respect to the actual distances so we take
        # minus sign
        self.threshold_ = - thresholds[imax_valid]
        return self

    @staticmethod
    def _validate_calibration_params(strategy='accuracy', min_rate=None,
                                     beta=1.):
        """Ensure that calibration parameters have allowed values

        Raises
        ------
        ValueError
          If `strategy` is not a supported name, if `min_rate` is not a
          number in [0, 1] while the strategy is 'max_tpr' or 'max_tnr', or
          if `beta` is not a real number while the strategy is 'f_beta'.
        """
        if strategy not in ('accuracy', 'f_beta', 'max_tpr', 'max_tnr'):
            raise ValueError('Strategy can either be "accuracy", "f_beta" or '
                             '"max_tpr" or "max_tnr". Got "{}" instead.'
                             .format(strategy))
        if strategy == 'max_tpr' or strategy == 'max_tnr':
            # `not x >= 0` (rather than `x < 0`) also rejects NaN.
            if (min_rate is None or not isinstance(min_rate, (int, float)) or
                    not min_rate >= 0 or not min_rate <= 1):
                raise ValueError('Parameter min_rate must be a number in'
                                 '[0, 1]. '
                                 'Got {} instead.'.format(min_rate))
        if strategy == 'f_beta':
            if beta is None or not isinstance(beta, (int, float)):
                raise ValueError('Parameter beta must be a real number. '
                                 'Got {} instead.'.format(type(beta)))
class _TripletsClassifierMixin(BaseMetricLearner, ClassifierMixin):
    """
    Base class for triplets learners.
    """

    # NOTE(review): `predict` yields values in {-1, +1} whereas `classes_`
    # lists [0, 1] -- confirm before relying on either.
    classes_ = np.array([0, 1])
    _tuple_size = 3  # number of points in a tuple, 3 for triplets

    def predict(self, triplets):
        """Predicts the ordering between sample distances in input triplets.

        For each triplet, returns 1 if the first element is closer to the
        second than to the last (strictly positive decision function) and
        -1 if not.

        Parameters
        ----------
        triplets : array-like, shape=(n_triplets, 3, n_features) or \
                   (n_triplets, 3)
          3D array of triplets to predict, with each row corresponding to
          three points, or 2D array of indices of triplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        prediction : `numpy.ndarray` with values in {-1, 1}, \
                     shape=(n_constraints,)
          Predictions of the ordering of pairs, for each triplet.
        """
        in_right_order = self.decision_function(triplets) > 0
        return 2 * in_right_order - 1

    def decision_function(self, triplets):
        """Predicts differences between sample distances in input triplets.

        For each triplet (X_a, X_b, X_c) in the samples, computes the score
        of the first pair (X_a, X_b) minus the score of the second pair
        (X_a, X_c). When the score is the opposite of a distance, this is
        the learned distance of the second pair minus the learned distance
        of the first pair. The higher it is, the more probable it is that
        the pairs in the triplets are presented in the right order, i.e.
        that the label of the triplet is 1. The lower it is, the more
        probable it is that the label of the triplet is -1.

        Parameters
        ----------
        triplets : array-like, shape=(n_triplets, 3, n_features) or \
                   (n_triplets, 3)
          3D array of triplets to predict, with each row corresponding to
          three points, or 2D array of indices of triplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
          Metric differences.
        """
        check_is_fitted(self, 'preprocessor_')
        checked = check_input(triplets, type_of_inputs='tuples',
                              preprocessor=self.preprocessor_,
                              estimator=self, tuple_size=self._tuple_size)
        first_pair_score = self.pair_score(checked[:, :2])
        second_pair_score = self.pair_score(checked[:, [0, 2]])
        return first_pair_score - second_pair_score

    def score(self, triplets):
        """Computes score on input triplets.

        Returns the accuracy score of the following classification task: a
        triplet (X_a, X_b, X_c) is correctly classified if the predicted
        similarity between the first pair (X_a, X_b) is higher than that of
        the second pair (X_a, X_c)

        Parameters
        ----------
        triplets : array-like, shape=(n_triplets, 3, n_features) or \
                   (n_triplets, 3)
          3D array of triplets to score, with each row corresponding to
          three points, or 2D array of indices of triplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        score : float
          The triplets score.
        """
        # Predictions live in {-1, +1}; halving their mean and adding 0.5
        # maps them onto {0, 1}, so the result is the fraction of triplets
        # predicted to be in the right order.
        predictions = self.predict(triplets)
        return predictions.mean() / 2 + 0.5
class _QuadrupletsClassifierMixin(BaseMetricLearner, ClassifierMixin):
    """
    Base class for quadruplets learners.
    """

    # NOTE(review): `predict` yields signs (-1, 0 or +1) whereas `classes_`
    # lists [0, 1] -- confirm before relying on either.
    classes_ = np.array([0, 1])
    _tuple_size = 4  # number of points in a tuple, 4 for quadruplets

    def predict(self, quadruplets):
        """Predicts the ordering between sample distances in input
        quadruplets.

        For each quadruplet, returns 1 if the quadruplet is in the right
        order (first pair is more similar than second pair), and -1 if not.
        The value is the sign of the decision function, so it is 0 when both
        pairs get exactly the same score.

        Parameters
        ----------
        quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
                      (n_quadruplets, 4)
          3D Array of quadruplets to predict, with each row corresponding to
          four points, or 2D array of indices of quadruplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        prediction : `numpy.ndarray` of floats, shape=(n_constraints,)
          Predictions of the ordering of pairs, for each quadruplet.
        """
        score_gaps = self.decision_function(quadruplets)
        return np.sign(score_gaps)

    def decision_function(self, quadruplets):
        """Predicts differences between sample distances in input
        quadruplets.

        For each quadruplet in the samples, computes the score of the first
        pair minus the score of the second pair. When the score is the
        opposite of a distance, this is the learned metric of the second
        pair minus the learned metric of the first pair. The higher it is,
        the more probable it is that the pairs in the quadruplet are
        presented in the right order, i.e. that the label of the quadruplet
        is 1. The lower it is, the more probable it is that the label of the
        quadruplet is -1.

        Parameters
        ----------
        quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
                      (n_quadruplets, 4)
          3D Array of quadruplets to predict, with each row corresponding to
          four points, or 2D array of indices of quadruplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
          Metric differences.
        """
        check_is_fitted(self, 'preprocessor_')
        checked = check_input(quadruplets, type_of_inputs='tuples',
                              preprocessor=self.preprocessor_,
                              estimator=self, tuple_size=self._tuple_size)
        first_pair_score = self.pair_score(checked[:, :2])
        second_pair_score = self.pair_score(checked[:, 2:])
        return first_pair_score - second_pair_score

    def score(self, quadruplets):
        """Computes score on input quadruplets

        Returns the accuracy score of the following classification task: a
        record is correctly classified if the predicted similarity between
        the first two samples is higher than that of the last two.

        Parameters
        ----------
        quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or \
                      (n_quadruplets, 4)
          3D Array of quadruplets to score, with each row corresponding to
          four points, or 2D array of indices of quadruplets if the metric
          learner uses a preprocessor.

        Returns
        -------
        score : float
          The quadruplets score.
        """
        # Predictions of +1 / -1 are rescaled to 1 / 0 by halving their mean
        # and adding 0.5, which yields the fraction of quadruplets predicted
        # to be in the right order.
        predictions = self.predict(quadruplets)
        return predictions.mean() / 2 + 0.5