
"""The hashing module contains all methods and classes related to the hashing trick."""

import hashlib
import math
import multiprocessing
import platform
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd

import category_encoders.utils as util

__author__ = 'willmcginnis', 'LiuShulun'


class HashingEncoder(util.UnsupervisedTransformerMixin, util.BaseEncoder):
    """A multivariate hashing implementation with configurable dimensionality/precision.

    The advantage of this encoder is that it does not maintain a dictionary of observed
    categories. Consequently, the encoder does not grow in size and accepts new values
    during data scoring by design.

    It's important to read about how max_process & max_sample work before setting them
    manually; an inappropriate setting slows down encoding.

    The default value of 'max_process' is 1 on Windows because multiprocessing might
    cause issues there; see:
    https://github.com/scikit-learn-contrib/categorical-encoding/issues/215
    https://docs.python.org/2/library/multiprocessing.html?highlight=process#windows

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    cols: list
        a list of columns to encode. If None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    hash_method: str
        which hashing method to use. Any method from hashlib works.
    max_process: int
        how many processes to use in transform(). Limited to range(1, 128).
        By default, it uses half of the logical CPUs. For example, 4C4T makes
        max_process=2 and 4C8T makes max_process=4. Set it larger if you have a
        strong CPU. It is not recommended to set it larger than the count of
        logical CPUs, as that will actually slow down the encoding.
    max_sample: int
        how many samples each process encodes at a time. This setting is useful on
        low-memory machines. By default, max_sample=(number of samples)/(max_process).
        For example, a 4C8T CPU with 100,000 samples makes max_sample=25,000; a 6C12T
        CPU with 100,000 samples makes max_sample=16,666. It is not recommended to
        set it larger than the default value.
    n_components: int
        how many bits to use to represent the feature. By default, we use 8 bits.
        For high-cardinality features, consider using up to 32 bits.
    process_creation_method: string
        either "fork", "spawn" or "forkserver" (availability depends on your platform).
        See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
        for more details and tradeoffs. Defaults to "fork" on Linux/macOS, as it is the
        fastest option, and to "spawn" on Windows, as it is the only one available there.

    Example
    -------
    >>> from category_encoders.hashing import HashingEncoder
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
    >>> display_cols = [
    ...     'Id',
    ...     'MSSubClass',
    ...     'MSZoning',
    ...     'LotFrontage',
    ...     'YearBuilt',
    ...     'Heating',
    ...     'CentralAir',
    ... ]
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> y = bunch.target
    >>> he = HashingEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
    >>> numeric_dataset = he.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 13 columns):
     #   Column       Non-Null Count  Dtype
    ---  ------       --------------  -----
     0   col_0        1460 non-null   int64
     1   col_1        1460 non-null   int64
     2   col_2        1460 non-null   int64
     3   col_3        1460 non-null   int64
     4   col_4        1460 non-null   int64
     5   col_5        1460 non-null   int64
     6   col_6        1460 non-null   int64
     7   col_7        1460 non-null   int64
     8   Id           1460 non-null   float64
     9   MSSubClass   1460 non-null   float64
     10  MSZoning     1460 non-null   object
     11  LotFrontage  1201 non-null   float64
     12  YearBuilt    1460 non-null   float64
    dtypes: float64(4), int64(8), object(1)
    memory usage: 148.4+ KB
    None

    References
    ----------
    .. [1] Feature Hashing for Large Scale Multitask Learning, from
        https://alex.smola.org/papers/2009/Weinbergeretal09.pdf
    .. [2] Don't be tricked by the Hashing Trick, from
        https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087

    """

    prefit_ordinal = False
    encoding_relation = util.EncodingRelation.ONE_TO_M

    def __init__(
        self,
        max_process=0,
        max_sample=0,
        verbose=0,
        n_components=8,
        cols=None,
        drop_invariant=False,
        return_df=True,
        hash_method='md5',
        process_creation_method='fork',
    ):
        super().__init__(
            verbose=verbose,
            cols=cols,
            drop_invariant=drop_invariant,
            return_df=return_df,
            handle_unknown='does not apply',
            handle_missing='does not apply',
        )
        if max_process not in range(1, 128):
            if platform.system() == 'Windows':
                self.max_process = 1
            else:
                self.max_process = int(math.ceil(multiprocessing.cpu_count() / 2))
                if self.max_process < 1:
                    self.max_process = 1
                elif self.max_process > 128:
                    self.max_process = 128
        else:
            self.max_process = max_process
        self.max_sample = int(max_sample)
        if platform.system() == 'Windows':
            self.process_creation_method = 'spawn'
        else:
            self.process_creation_method = process_creation_method
        self.data_lines = 0
        self.X = None
        self.n_components = n_components
        self.hash_method = hash_method

    def _fit(self, X, y=None, **kwargs):
        pass

    def _transform(self, X, override_return_df=False):
        """Perform the transformation to new categorical data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        p : array, shape = [n_samples, n_numeric + N]
            Transformed values with encoding applied.

        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # first check the type
        X = util.convert_input(X)

        # then make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}')

        if not list(self.cols):
            return X

        X = self.hashing_trick(
            X,
            hashing_method=self.hash_method,
            N=self.n_components,
            cols=self.cols,
        )

        return X
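    # A minimal sketch of how the max_process default resolves (illustrative only,
    # not part of the library API). On a non-Windows machine with max_process left
    # at its default of 0, __init__ above falls back to half of the logical CPUs:
    #
    #   >>> import math, multiprocessing
    #   >>> math.ceil(multiprocessing.cpu_count() / 2)   # e.g. 4 on a 4C8T CPU
    #
    # On Windows the fallback is always 1, in which case transform() takes the
    # single-threaded path in hashing_trick() below.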
    @staticmethod
    def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
        """Perform hashing on the given numpy array.

        Parameters
        ----------
        hash_method: str
            Hashlib method to use.
        np_df: np.ndarray
            Data to hash.
        N: int
            Number of bits to encode the data.

        Returns
        -------
        np.ndarray
            Hashed data.

        """
        # Calling getattr outside the loop saves some time in the loop
        hasher_constructor = getattr(hashlib, hash_method)
        # Same when the call to getattr is implicit
        int_from_bytes = int.from_bytes
        result = np.zeros((np_df.shape[0], N), dtype='int')
        for i, row in enumerate(np_df):
            for val in row:
                if val is not None:
                    hasher = hasher_constructor()
                    # Computes an integer index from the hasher digest. The endian is
                    # "big" as the code used to read:
                    # column_index = int(hasher.hexdigest(), 16) % N
                    # which implicitly treats the hexdigest as big endian,
                    # even if the system is little endian.
                    # Building the index this way is about 30% faster than using the
                    # hexdigest.
                    hasher.update(bytes(str(val), 'utf-8'))
                    column_index = int_from_bytes(hasher.digest(), byteorder='big') % N
                    result[i, column_index] += 1
        return result
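    # A small sanity sketch of the digest-to-index equivalence described in the
    # comment above (illustrative only; 'example' is an arbitrary value):
    #
    #   >>> import hashlib
    #   >>> h = hashlib.md5(bytes(str('example'), 'utf-8'))
    #   >>> int.from_bytes(h.digest(), byteorder='big') % 8 == int(h.hexdigest(), 16) % 8
    #   True
    #
    # Both expressions read the digest as a big-endian integer, so both map a given
    # value to the same one of the N output columns.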
    def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
        """Perform the hashing trick in parallel.

        Parameters
        ----------
        df: pd.DataFrame
            data to hash.
        N: int
            how many bits to use to represent the feature.

        Returns
        -------
        pd.DataFrame
            hashed data.

        """
        np_df = df.to_numpy()
        ctx = multiprocessing.get_context(self.process_creation_method)

        with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
            result = np.concatenate(
                list(
                    executor.map(
                        self.hash_chunk,
                        [self.hash_method] * self.max_process,
                        np.array_split(np_df, self.max_process),
                        [N] * self.max_process,
                    )
                )
            )

        return pd.DataFrame(result, index=df.index)
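    # How rows are distributed across the workers above (illustrative only):
    # np.array_split tolerates chunk counts that do not divide the row count
    # evenly, e.g. splitting 10 rows across 4 workers yields chunks of sizes
    # 3, 3, 2 and 2:
    #
    #   >>> import numpy as np
    #   >>> [len(c) for c in np.array_split(np.arange(10), 4)]
    #   [3, 3, 2, 2]
    #
    # Each chunk is hashed independently by hash_chunk, and np.concatenate
    # re-assembles the per-chunk results in order, so the row order of df is
    # preserved.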
    def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
        """Perform the hashing trick in a single thread (non-parallel).

        Parameters
        ----------
        df: pd.DataFrame
            data to hash.
        N: int
            how many bits to use to represent the feature.

        Returns
        -------
        pd.DataFrame
            hashed data.

        """
        np_df = df.to_numpy()
        result = HashingEncoder.hash_chunk(self.hash_method, np_df, N)
        return pd.DataFrame(result, index=df.index)
    def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
        """A basic hashing implementation with configurable dimensionality/precision.

        Performs the hashing trick on a pandas dataframe, `X`, using the hashing method
        from hashlib identified by `hashing_method`. The number of output dimensions
        (`N`) and the columns to hash (`cols`) are also configurable. See [1]_ for the
        original formulation of feature hashing.

        Parameters
        ----------
        X_in: pandas dataframe
            the dataframe to encode.
        hashing_method: string, optional
            the name of the hashlib hashing method to use.
        N: int, optional
            the number of output dimensions.
        cols: list, optional
            the columns to hash; defaults to all columns.
        make_copy: bool, optional
            whether to operate on a deep copy of `X_in`.

        Returns
        -------
        out : dataframe
            A hashing encoded dataframe.

        References
        ----------
        .. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola;
            Josh Attenberg (2009). Feature Hashing for Large Scale Multitask Learning.
            Proc. ICML.

        """
        if hashing_method not in hashlib.algorithms_available:
            raise ValueError(
                f"Hashing Method: {hashing_method} not available. "
                f"Please use one of: {', '.join([str(x) for x in hashlib.algorithms_available])}"
            )

        if make_copy:
            X = X_in.copy(deep=True)
        else:
            X = X_in

        if cols is None:
            cols = X.columns

        new_cols = [f'col_{d}' for d in range(N)]

        X_cat = X.loc[:, cols]
        X_num = X.loc[:, [x for x in X.columns if x not in cols]]

        if self.max_process == 1:
            X_cat = self.hashing_trick_with_np_no_parallel(X_cat, N)
        else:
            X_cat = self.hashing_trick_with_np_parallel(X_cat, N)

        X_cat.columns = new_cols

        X = pd.concat([X_cat, X_num], axis=1)

        return X
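

# A minimal usage sketch (illustrative only; the column names and values below are
# made up). With n_components=4, each 'city' value is hashed into one of the four
# columns col_0..col_3, while the numeric 'age' column passes through untouched:
if __name__ == '__main__':
    _demo = pd.DataFrame({'city': ['oslo', 'cairo', 'oslo'], 'age': [31, 45, 27]})
    _enc = HashingEncoder(max_process=1, n_components=4, cols=['city']).fit(_demo)
    # Expected shape: 3 rows x 5 columns (col_0..col_3 plus 'age'); hashing is
    # deterministic, so the two 'oslo' rows receive identical hash columns.
    print(_enc.transform(_demo))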