"""
Helper module for generating different types of constraints
from supervised data labels.
"""
import numpy as np
import warnings
from sklearn.utils import check_random_state
from sklearn.neighbors import NearestNeighbors
__all__ = ['Constraints']
class Constraints(object):
"""
Class to build constraints from labeled data.
See more in the :ref:`User Guide <supervised_version>`.
Parameters
----------
partial_labels : `numpy.ndarray` of ints, shape=(n_samples,)
Array of labels, with -1 indicating unknown label.
Attributes
----------
partial_labels : `numpy.ndarray` of ints, shape=(n_samples,)
Array of labels, with -1 indicating unknown label.
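
Examples
--------
A minimal sketch, assuming this module is importable as
``metric_learn.constraints``:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y_partial = np.array([0, 0, 1, 1, -1])  # -1 marks an unlabeled point
>>> constraints = Constraints(y_partial)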
"""
def __init__(self, partial_labels):
partial_labels = np.asanyarray(partial_labels, dtype=int)
self.partial_labels = partial_labels
def positive_negative_pairs(self, n_constraints, same_length=False,
random_state=None, num_constraints='deprecated'):
"""
Generates positive pairs and negative pairs from labeled data.
Positive pairs are formed by randomly drawing ``n_constraints`` pairs of
points with the same label. Negative pairs are formed by randomly drawing
``n_constraints`` pairs of points with different labels.
In the case where it is not possible to generate enough positive or
negative pairs, a smaller number of pairs will be returned with a warning.
Parameters
----------
n_constraints : int
Number of positive and negative constraints to generate.
same_length : bool, optional (default=False)
If True, forces the number of positive and negative pairs to be
equal by ignoring some pairs from the larger set.
random_state : int or numpy.RandomState or None, optional (default=None)
A pseudo random number generator object or a seed for it if int.
num_constraints : Deprecated alias for ``n_constraints``. Will be removed in 0.7.0.
Returns
-------
a : array-like, shape=(n_constraints,)
1D array of indices of the left elements of positive pairs.
b : array-like, shape=(n_constraints,)
1D array of indices of the right elements of positive pairs.
c : array-like, shape=(n_constraints,)
1D array of indices of the left elements of negative pairs.
d : array-like, shape=(n_constraints,)
1D array of indices of the right elements of negative pairs.
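
Examples
--------
A minimal usage sketch; the pairs actually drawn depend on the random
state:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y = np.array([0, 0, 0, 1, 1, 1])
>>> a, b, c, d = Constraints(y).positive_negative_pairs(3, random_state=0)
>>> # each (a[i], b[i]) shares a label, each (c[i], d[i]) does not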
"""
if num_constraints != 'deprecated':
warnings.warn('"num_constraints" parameter has been renamed to'
' "n_constraints". It has been deprecated in'
' version 0.6.3 and will be removed in 0.7.0'
'', FutureWarning)
self.n_constraints = num_constraints
else:
self.n_constraints = n_constraints
random_state = check_random_state(random_state)
a, b = self._pairs(self.n_constraints, same_label=True,
                   random_state=random_state)
c, d = self._pairs(self.n_constraints, same_label=False,
                   random_state=random_state)
if same_length and len(a) != len(c):
n = min(len(a), len(c))
return a[:n], b[:n], c[:n], d[:n]
return a, b, c, d
def generate_knntriplets(self, X, k_genuine, k_impostor):
"""
Generates triplets from labeled data.
For every point (X_a), the triplets (X_a, X_b, X_c) are built from all
combinations of one of its `k_genuine` nearest neighbors of the same
class (X_b) and one of its `k_impostor` nearest neighbors from other
classes (X_c).
If a class does not have enough points of the same class (respectively,
of other classes) to yield `k_genuine` (respectively `k_impostor`)
neighbors, a warning is raised and the maximum achievable number of
genuine (impostor) neighbors is used for that class.
Parameters
----------
X : array-like, shape=(n_samples, n_features)
Input data, where each row corresponds to a single instance.
k_genuine : int
Number of neighbors of the same class to be taken into account.
k_impostor : int
Number of neighbors of different classes to be taken into account.
Returns
-------
triplets : array-like, shape=(n_triplets, 3)
2D array of triplets of indices.
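
Examples
--------
A small sketch on a toy dataset; for this data the output is
deterministic:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> X = np.array([[0.], [1.], [2.], [10.], [11.], [12.]])
>>> y = np.array([0, 0, 0, 1, 1, 1])
>>> triplets = Constraints(y).generate_knntriplets(X, k_genuine=1,
...                                                k_impostor=1)
>>> triplets.shape
(6, 3)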
"""
# Ignore unlabeled samples
known_labels_mask = self.partial_labels >= 0
known_labels = self.partial_labels[known_labels_mask]
X = X[known_labels_mask]
labels, labels_count = np.unique(known_labels, return_counts=True)
len_input = known_labels.shape[0]
# Handle the case where there are too few elements to yield k_genuine or
# k_impostor neighbors for every class.
k_genuine_vec = np.full_like(labels, k_genuine)
k_impostor_vec = np.full_like(labels, k_impostor)
for i, count in enumerate(labels_count):
if k_genuine + 1 > count:
k_genuine_vec[i] = count-1
warnings.warn("The class {} has {} elements, which is not sufficient "
"to generate {} genuine neighbors as specified by "
"k_genuine. Will generate {} genuine neighbors instead."
"\n"
.format(labels[i], count, k_genuine+1,
k_genuine_vec[i]))
if k_impostor > len_input - count:
k_impostor_vec[i] = len_input - count
warnings.warn("The class {} has {} elements of other classes, which is"
" not sufficient to generate {} impostor neighbors as "
"specified by k_impostor. Will generate {} impostor "
"neighbors instead.\n"
.format(labels[i], k_impostor_vec[i], k_impostor,
k_impostor_vec[i]))
# The total number of possible triplets combinations per label comes from
# taking one of the k_genuine_vec[i] genuine neighbors and one of the
# k_impostor_vec[i] impostor neighbors for the labels_count[i] elements
comb_per_label = labels_count * k_genuine_vec * k_impostor_vec
# Get start and finish for later triplet assigning
# append zero at the beginning for start and get cumulative sum
start_finish_indices = np.hstack((0, comb_per_label)).cumsum()
# Total number of triplets is the sum of all possible combinations per
# label
num_triplets = start_finish_indices[-1]
triplets = np.empty((num_triplets, 3), dtype=np.intp)
neigh = NearestNeighbors()
for i, label in enumerate(labels):
# generate mask for current label
gen_mask = known_labels == label
gen_indx = np.where(gen_mask)
# get k_genuine genuine neighbors
neigh.fit(X=X[gen_indx])
# Take elements of gen_indx according to the yielded k-neighbors
gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i],
return_distance=False)
gen_neigh = np.take(gen_indx, gen_relative_indx)
# generate mask for impostors of current label
imp_indx = np.where(~gen_mask)
# get k_impostor impostor neighbors
neigh.fit(X=X[imp_indx])
# Take elements of imp_indx according to the yielded k-neighbors
imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i],
X=X[gen_mask],
return_distance=False)
imp_neigh = np.take(imp_indx, imp_relative_indx)
# number of triplets for this label:
# labels_count[i] * k_genuine_vec[i] * k_impostor_vec[i]
start, finish = start_finish_indices[i:i+2]
triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh,
k_genuine_vec[i],
k_impostor_vec[i])
return triplets
def _pairs(self, n_constraints, same_label=True, max_iter=10,
random_state=np.random):
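# Rejection-sample pairs of indices among the points with a known label:
# repeatedly draw a random anchor point and pair it with a random point
# having the same (resp. a different) label, until n_constraints distinct
# pairs are collected or max_iter sampling rounds have been performed.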
known_label_idx, = np.where(self.partial_labels >= 0)
known_labels = self.partial_labels[known_label_idx]
num_labels = len(known_labels)
ab = set()
it = 0
while it < max_iter and len(ab) < n_constraints:
nc = n_constraints - len(ab)
for aidx in random_state.randint(num_labels, size=nc):
if same_label:
mask = known_labels[aidx] == known_labels
mask[aidx] = False # avoid identity pairs
else:
mask = known_labels[aidx] != known_labels
b_choices, = np.where(mask)
if len(b_choices) > 0:
ab.add((aidx, random_state.choice(b_choices)))
it += 1
if len(ab) < n_constraints:
warnings.warn("Only generated %d %s constraints (requested %d)" % (
len(ab), 'positive' if same_label else 'negative', n_constraints))
ab = np.array(list(ab)[:n_constraints], dtype=int)
return known_label_idx[ab.T]
def chunks(self, n_chunks=100, chunk_size=2, random_state=None,
num_chunks='deprecated'):
"""
Generates chunks from labeled data.
Each of ``n_chunks`` chunks is composed of ``chunk_size`` points from
the same class drawn at random. Each point can belong to at most 1 chunk.
In the case where there are not enough points to generate ``n_chunks``
chunks of size ``chunk_size``, a ValueError will be raised.
Parameters
----------
n_chunks : int, optional (default=100)
Number of chunks to generate.
chunk_size : int, optional (default=2)
Number of points in each chunk.
random_state : int or numpy.RandomState or None, optional (default=None)
A pseudo random number generator object or a seed for it if int.
num_chunks : Deprecated alias for ``n_chunks``. Will be removed in 0.7.0.
Returns
-------
chunks : array-like, shape=(n_samples,)
1D array of chunk labels, where -1 indicates that the point does not
belong to any chunk.
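
Examples
--------
A minimal usage sketch; which points end up in which chunk depends on the
random state:

>>> import numpy as np
>>> from metric_learn.constraints import Constraints
>>> y = np.array([0, 0, 1, 1, -1])
>>> chunk_labels = Constraints(y).chunks(n_chunks=2, chunk_size=2,
...                                      random_state=0)
>>> chunk_labels.shape
(5,)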
"""
if num_chunks != 'deprecated':
warnings.warn('"num_chunks" parameter has been renamed to'
' "n_chunks". It has been deprecated in'
' version 0.6.3 and will be removed in 0.7.0'
'', FutureWarning)
n_chunks = num_chunks
random_state = check_random_state(random_state)
chunks = -np.ones_like(self.partial_labels, dtype=int)
uniq, lookup = np.unique(self.partial_labels, return_inverse=True)
unknown_uniq = np.where(uniq < 0)[0]
all_inds = [set(np.where(lookup == c)[0]) for c in range(len(uniq))
if c not in unknown_uniq]
max_chunks = int(np.sum([len(s) // chunk_size for s in all_inds]))
if max_chunks < n_chunks:
raise ValueError(('Not enough possible chunks of %d elements in each'
' class to form expected %d chunks - maximum number'
' of chunks is %d'
) % (chunk_size, n_chunks, max_chunks))
idx = 0
while idx < n_chunks and all_inds:
# pick uniformly at random among the classes that still have points left
c = random_state.randint(0, high=len(all_inds))
inds = all_inds[c]
if len(inds) < chunk_size:
del all_inds[c]
continue
ii = random_state.choice(list(inds), chunk_size, replace=False)
inds.difference_update(ii)
chunks[ii] = idx
idx += 1
return chunks
def comb(A, B, C, sizeB, sizeC):
# generate_knntriplets helper function
# generate an array with all combinations of choosing
# an element from A, B and C
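# e.g. with A = (array([0, 2]),), B = [[2], [0]], C = [[1, 3], [3, 1]],
# sizeB = 1 and sizeC = 2, the result stacks the rows
# (0, 2, 1), (0, 2, 3), (2, 0, 3), (2, 0, 1)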
return np.vstack((np.tile(A, (sizeB*sizeC, 1)).ravel(order='F'),
np.tile(np.hstack(B), (sizeC, 1)).ravel(order='F'),
np.tile(C, (1, sizeB)).ravel())).T
def wrap_pairs(X, constraints):
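"""Combine positive and negative pair constraints into pairs and labels.

``constraints`` is a 4-tuple ``(a, b, c, d)`` of indices into ``X``, such
as returned by :meth:`Constraints.positive_negative_pairs`. The returned
``pairs`` array has shape (n_positive + n_negative, 2, n_features) and
contains the positive pairs followed by the negative pairs; ``y`` holds
+1 for positive pairs and -1 for negative pairs.
"""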
a = np.array(constraints[0])
b = np.array(constraints[1])
c = np.array(constraints[2])
d = np.array(constraints[3])
constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
y = np.concatenate([np.ones_like(a), -np.ones_like(c)])
pairs = X[constraints]
return pairs, y