"""
Helper module for generating different types of constraints
from supervised data labels.
"""
import numpy as np
import warnings
from sklearn.utils import check_random_state
from sklearn.neighbors import NearestNeighbors
__all__ = ['Constraints']
class Constraints(object):
"""
Class to build constraints from labeled data.
See more in the :ref:`User Guide <supervised_version>`.
Parameters
----------
partial_labels : `numpy.ndarray` of ints, shape=(n_samples,)
Array of labels, with -1 indicating unknown label.
Attributes
----------
partial_labels : `numpy.ndarray` of ints, shape=(n_samples,)
Array of labels, with -1 indicating unknown label.
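
Examples
--------
A minimal sketch, assuming this module is importable as
``metric_learn.constraints``:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y_partial = np.array([0, 0, 1, 1, -1])  # -1 marks an unlabeled point
>>> constraints = Constraints(y_partial)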
"""
def __init__(self, partial_labels):
partial_labels = np.asanyarray(partial_labels, dtype=int)
self.partial_labels = partial_labels
def positive_negative_pairs(self, n_constraints, same_length=False,
random_state=None, num_constraints='deprecated'):
"""
Generates positive pairs and negative pairs from labeled data.
Positive pairs are formed by randomly drawing ``n_constraints`` pairs of
points with the same label. Negative pairs are formed by randomly drawing
``n_constraints`` pairs of points with different labels.
In the case where it is not possible to generate enough positive or
negative pairs, a smaller number of pairs will be returned with a warning.
Parameters
----------
n_constraints : int
Number of positive and negative constraints to generate.
same_length : bool, optional (default=False)
If True, forces the number of positive and negative pairs to be
equal by ignoring some pairs from the larger set.
random_state : int or numpy.RandomState or None, optional (default=None)
A pseudo random number generator object or a seed for it if int.
num_constraints : Deprecated alias for ``n_constraints``. Will be removed in 0.7.0.
Returns
-------
a : array-like, shape=(n_constraints,)
1D array of indices of the left elements of positive pairs.
b : array-like, shape=(n_constraints,)
1D array of indices of the right elements of positive pairs.
c : array-like, shape=(n_constraints,)
1D array of indices of the left elements of negative pairs.
d : array-like, shape=(n_constraints,)
1D array of indices of the right elements of negative pairs.
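
Examples
--------
A minimal usage sketch; the pairs actually drawn depend on the random
state:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y = np.array([0, 0, 0, 1, 1, 1])
>>> a, b, c, d = Constraints(y).positive_negative_pairs(3, random_state=0)
>>> # each (a[i], b[i]) shares a label, each (c[i], d[i]) does not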
"""
if num_constraints != 'deprecated':
warnings.warn('"num_constraints" parameter has been renamed to'
' "n_constraints". It has been deprecated in'
' version 0.6.3 and will be removed in 0.7.0'
'', FutureWarning)
self.n_constraints = num_constraints
else:
self.n_constraints = n_constraints
random_state = check_random_state(random_state)
a, b = self._pairs(self.n_constraints, same_label=True,
                   random_state=random_state)
c, d = self._pairs(self.n_constraints, same_label=False,
                   random_state=random_state)
if same_length and len(a) != len(c):
n = min(len(a), len(c))
return a[:n], b[:n], c[:n], d[:n]
return a, b, c, d
def generate_knntriplets(self, X, k_genuine, k_impostor):
"""
Generates triplets from labeled data.
For every point (X_a), the triplets (X_a, X_b, X_c) are built from all
combinations of one of its `k_genuine` nearest neighbors of the same
class (X_b) and one of its `k_impostor` nearest neighbors from other
classes (X_c).
If a class does not have enough points of the same class (respectively,
of other classes) to yield `k_genuine` (respectively `k_impostor`)
neighbors, a warning is raised and the maximum achievable number of
genuine (impostor) neighbors is used for that class.
Parameters
----------
X : array-like, shape=(n_samples, n_features)
Input data, where each row corresponds to a single instance.
k_genuine : int
Number of neighbors of the same class to be taken into account.
k_impostor : int
Number of neighbors of different classes to be taken into account.
Returns
-------
triplets : array-like, shape=(n_triplets, 3)
2D array of triplets of indices.
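
Examples
--------
A small sketch on a toy dataset; for this data the output is
deterministic:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> X = np.array([[0.], [1.], [2.], [10.], [11.], [12.]])
>>> y = np.array([0, 0, 0, 1, 1, 1])
>>> triplets = Constraints(y).generate_knntriplets(X, k_genuine=1,
...                                                k_impostor=1)
>>> triplets.shape
(6, 3)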
"""
# Ignore unlabeled samples
known_labels_mask = self.partial_labels >= 0
known_labels = self.partial_labels[known_labels_mask]
X = X[known_labels_mask]
labels, labels_count = np.unique(known_labels, return_counts=True)
len_input = known_labels.shape[0]
# Handle the case where there are too few elements to yield k_genuine or
# k_impostor neighbors for every class.
k_genuine_vec = np.full_like(labels, k_genuine)
k_impostor_vec = np.full_like(labels, k_impostor)
for i, count in enumerate(labels_count):
if k_genuine + 1 > count:
k_genuine_vec[i] = count-1
warnings.warn("The class {} has {} elements, which is not sufficient "
"to generate {} genuine neighbors as specified by "
"k_genuine. Will generate {} genuine neighbors instead."
"\n"
.format(labels[i], count, k_genuine+1,
k_genuine_vec[i]))
if k_impostor > len_input - count:
k_impostor_vec[i] = len_input - count
warnings.warn("The class {} has {} elements of other classes, which is"
" not sufficient to generate {} impostor neighbors as "
"specified by k_impostor. Will generate {} impostor "
"neighbors instead.\n"
.format(labels[i], k_impostor_vec[i], k_impostor,
k_impostor_vec[i]))
# The total number of possible triplets combinations per label comes from
# taking one of the k_genuine_vec[i] genuine neighbors and one of the
# k_impostor_vec[i] impostor neighbors for the labels_count[i] elements
comb_per_label = labels_count * k_genuine_vec * k_impostor_vec
# Get start and finish for later triplet assigning
# append zero at the beginning for start and get cumulative sum
start_finish_indices = np.hstack((0, comb_per_label)).cumsum()
# Total number of triplets is the sum of all possible combinations per
# label
num_triplets = start_finish_indices[-1]
triplets = np.empty((num_triplets, 3), dtype=np.intp)
neigh = NearestNeighbors()
for i, label in enumerate(labels):
# generate mask for current label
gen_mask = known_labels == label
gen_indx = np.where(gen_mask)
# get k_genuine genuine neighbors
neigh.fit(X=X[gen_indx])
# Take elements of gen_indx according to the yielded k-neighbors
gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i],
return_distance=False)
gen_neigh = np.take(gen_indx, gen_relative_indx)
# generate mask for impostors of current label
imp_indx = np.where(~gen_mask)
# get k_impostor impostor neighbors
neigh.fit(X=X[imp_indx])
# Take elements of imp_indx according to the yielded k-neighbors
imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i],
X=X[gen_mask],
return_distance=False)
imp_neigh = np.take(imp_indx, imp_relative_indx)
# number of triplets for this label:
# labels_count[i] * k_genuine_vec[i] * k_impostor_vec[i]
start, finish = start_finish_indices[i:i+2]
triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh,
k_genuine_vec[i],
k_impostor_vec[i])
return triplets
def _pairs(self, n_constraints, same_label=True, max_iter=10,
random_state=np.random):
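# Rejection-sample pairs of indices among the points with a known label:
# repeatedly draw a random anchor point and pair it with a random point
# having the same (resp. a different) label, until n_constraints distinct
# pairs are collected or max_iter sampling rounds have been performed.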
known_label_idx, = np.where(self.partial_labels >= 0)
known_labels = self.partial_labels[known_label_idx]
num_labels = len(known_labels)
ab = set()
it = 0
while it < max_iter and len(ab) < n_constraints:
nc = n_constraints - len(ab)
for aidx in random_state.randint(num_labels, size=nc):
if same_label:
mask = known_labels[aidx] == known_labels
mask[aidx] = False # avoid identity pairs
else:
mask = known_labels[aidx] != known_labels
b_choices, = np.where(mask)
if len(b_choices) > 0:
ab.add((aidx, random_state.choice(b_choices)))
it += 1
if len(ab) < n_constraints:
warnings.warn("Only generated %d %s constraints (requested %d)" % (
len(ab), 'positive' if same_label else 'negative', n_constraints))
ab = np.array(list(ab)[:n_constraints], dtype=int)
return known_label_idx[ab.T]
def chunks(self, n_chunks=100, chunk_size=2, random_state=None,
num_chunks='deprecated'):
"""
Generates chunks from labeled data.
Each of ``n_chunks`` chunks is composed of ``chunk_size`` points from
the same class drawn at random. Each point can belong to at most 1 chunk.
In the case where there are not enough points to generate ``n_chunks``
chunks of size ``chunk_size``, a ValueError will be raised.
Parameters
----------
n_chunks : int, optional (default=100)
Number of chunks to generate.
chunk_size : int, optional (default=2)
Number of points in each chunk.
random_state : int or numpy.RandomState or None, optional (default=None)
A pseudo random number generator object or a seed for it if int.
num_chunks : Deprecated alias for ``n_chunks``. Will be removed in 0.7.0.
Returns
-------
chunks : array-like, shape=(n_samples,)
1D array of chunk labels, where -1 indicates that the point does not
belong to any chunk.
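
Examples
--------
A minimal usage sketch; which points end up in which chunk depends on the
random state:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y = np.array([0, 0, 1, 1, -1])
>>> chunk_labels = Constraints(y).chunks(n_chunks=2, chunk_size=2,
...                                      random_state=0)
>>> chunk_labels.shape
(5,)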
"""
if num_chunks != 'deprecated':
warnings.warn('"num_chunks" parameter has been renamed to'
' "n_chunks". It has been deprecated in'
' version 0.6.3 and will be removed in 0.7.0'
'', FutureWarning)
n_chunks = num_chunks
random_state = check_random_state(random_state)
chunks = -np.ones_like(self.partial_labels, dtype=int)
uniq, lookup = np.unique(self.partial_labels, return_inverse=True)
unknown_uniq = np.where(uniq < 0)[0]
all_inds = [set(np.where(lookup == c)[0]) for c in range(len(uniq))
if c not in unknown_uniq]
max_chunks = int(np.sum([len(s) // chunk_size for s in all_inds]))
if max_chunks < n_chunks:
raise ValueError(('Not enough possible chunks of %d elements in each'
' class to form expected %d chunks - maximum number'
' of chunks is %d'
) % (chunk_size, n_chunks, max_chunks))
idx = 0
while idx < n_chunks and all_inds:
# pick uniformly at random among the classes that still have points left
c = random_state.randint(0, high=len(all_inds))
inds = all_inds[c]
if len(inds) < chunk_size:
del all_inds[c]
continue
ii = random_state.choice(list(inds), chunk_size, replace=False)
inds.difference_update(ii)
chunks[ii] = idx
idx += 1
return chunks
def comb(A, B, C, sizeB, sizeC):
# generate_knntriplets helper function
# generate an array with all combinations of choosing
# an element from A, B and C
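# e.g. with A = (array([0, 2]),), B = [[2], [0]], C = [[1, 3], [3, 1]],
# sizeB = 1 and sizeC = 2, the result stacks the rows
# (0, 2, 1), (0, 2, 3), (2, 0, 3), (2, 0, 1)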
return np.vstack((np.tile(A, (sizeB*sizeC, 1)).ravel(order='F'),
np.tile(np.hstack(B), (sizeC, 1)).ravel(order='F'),
np.tile(C, (1, sizeB)).ravel())).T
def wrap_pairs(X, constraints):
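"""Combine positive and negative pair constraints into pairs and labels.

``constraints`` is a 4-tuple ``(a, b, c, d)`` of indices into ``X``, such
as returned by :meth:`Constraints.positive_negative_pairs`. The returned
``pairs`` array has shape (n_positive + n_negative, 2, n_features) and
contains the positive pairs followed by the negative pairs; ``y`` holds
+1 for positive pairs and -1 for negative pairs.
"""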
a = np.array(constraints[0])
b = np.array(constraints[1])
c = np.array(constraints[2])
d = np.array(constraints[3])
constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
y = np.concatenate([np.ones_like(a), -np.ones_like(c)])
pairs = X[constraints]
return pairs, y