"""Quantile Encoder."""
from __future__ import annotations
__author__ = 'david26694', 'cmougan'
import operator
import warnings
from functools import reduce
from typing import Sequence
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
import category_encoders.utils as util
from category_encoders.ordinal import OrdinalEncoder
class QuantileEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
    """Quantile Encoding for categorical features.

    This is a statistically modified version of the target MEstimate encoder where selected
    features are replaced by the statistical quantile instead of the mean. Replacing with the
    median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder
    it has two tunable parameters, `m` and `quantile`.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    quantile: float
        float indicating statistical quantile. `0.5` for median.
    m: float
        this is the "m" in the m-probability estimate. Higher value of m results into
        stronger shrinking. M is non-negative. 0 for no smoothing.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    handle_missing: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.
    handle_unknown: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
    >>> display_cols = [
    ...     'Id',
    ...     'MSSubClass',
    ...     'MSZoning',
    ...     'LotFrontage',
    ...     'YearBuilt',
    ...     'Heating',
    ...     'CentralAir',
    ... ]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = QuantileEncoder(cols=['CentralAir', 'Heating'], quantile=0.5, m=1.0).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 7 columns):
     #   Column       Non-Null Count  Dtype
    ---  ------       --------------  -----
     0   Id           1460 non-null   float64
     1   MSSubClass   1460 non-null   float64
     2   MSZoning     1460 non-null   object
     3   LotFrontage  1201 non-null   float64
     4   YearBuilt    1460 non-null   float64
     5   Heating      1460 non-null   float64
     6   CentralAir   1460 non-null   float64
    dtypes: float64(6), object(1)
    memory usage: 80.0+ KB
    None

    References
    ----------
    .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems,
        https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
    .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification
        and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
    .. [3] On estimating probabilities in tree pruning, equation 1,
        from https://link.springer.com/chapter/10.1007/BFb0017010
    .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
    .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
    """

    prefit_ordinal = True
    encoding_relation = util.EncodingRelation.ONE_TO_ONE

    def __init__(
        self,
        verbose: int = 0,
        cols: list[str] | None = None,
        drop_invariant: bool = False,
        return_df: bool = True,
        handle_missing: str = 'value',
        handle_unknown: str = 'value',
        quantile: float = 0.5,
        m: float = 1.0,
    ):
        super().__init__(
            verbose=verbose,
            cols=cols,
            drop_invariant=drop_invariant,
            return_df=return_df,
            handle_unknown=handle_unknown,
            handle_missing=handle_missing,
        )
        # Both are populated by `_fit`.
        self.ordinal_encoder = None
        self.mapping = None
        self.quantile = quantile
        self.m = m

    def _fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
        """Fit the internal ordinal encoder and compute the per-column quantile mapping."""
        y = y.astype(float)
        # The ordinal encoder is always run with 'value' handling; the user-selected
        # handle_unknown / handle_missing policies are applied on top of its output
        # in `fit_quantile_encoding` and `_transform`.
        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            handle_unknown='value',
            handle_missing='value',
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        X_ordinal = self.ordinal_encoder.transform(X)
        self.mapping = self.fit_quantile_encoding(X_ordinal, y)

    def fit_quantile_encoding(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series]:
        """Calculate the quantile encoding mapping.

        Parameters
        ----------
        X: training data (already ordinal-encoded).
        y: target data.

        Returns
        -------
        mapping col-name -> series with category-label -> quantile mapping.
        """
        mapping = {}

        # Global statistic used both as the shrinkage target and as the fallback value.
        prior = np.quantile(y, self.quantile)

        for switch in self.ordinal_encoder.category_mapping:
            col = switch.get('col')
            values = switch.get('mapping')

            # Per-category quantile and count of the target. Only these two statistics
            # enter the estimate below, so nothing else is aggregated.
            stats = y.groupby(X[col]).agg([lambda x: np.quantile(x, self.quantile), 'count'])
            stats.columns = ['quantile', 'count']

            # m-probability estimate: per-category quantile shrunk towards the prior.
            estimate = (stats['count'] * stats['quantile'] + prior * self.m) / (
                stats['count'] + self.m
            )

            # Ordinal code -1 marks unknown categories (see the check in `_transform`).
            if self.handle_unknown == 'return_nan':
                estimate.loc[-1] = np.nan
            elif self.handle_unknown == 'value':
                estimate.loc[-1] = prior

            if self.handle_missing == 'return_nan':
                # NOTE(review): assumes the ordinal mapping always has a NaN key — confirm
                # against OrdinalEncoder; otherwise this lookup raises KeyError.
                estimate.loc[values.loc[np.nan]] = np.nan
            elif self.handle_missing == 'value':
                # presumably -2 is the ordinal code for missing values not seen in fit; verify.
                estimate.loc[-2] = prior

            mapping[col] = estimate

        return mapping

    def _transform(self, X: pd.DataFrame, y: pd.Series | None = None):
        """Ordinal-encode X, enforce the handle_unknown='error' policy, then map to quantiles."""
        X = self.ordinal_encoder.transform(X)

        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError('Unexpected categories found in dataframe')

        X = self.quantile_encode(X)
        return X

    def quantile_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:
        """Apply quantile encoding.

        Works on a deep copy of `X_in` and replaces every column in `self.cols`
        by looking its values up in the fitted mapping for that column.
        """
        X = X_in.copy(deep=True)

        for col in self.cols:
            X[col] = X[col].map(self.mapping[col])

        return X
# TODO: does not fit in schema since it is an ensemble of other encoders
class SummaryEncoder(BaseEstimator):
    """Summary Encoding for categorical features.

    It's an encoder designed for creating richer representations by applying quantile
    encoding for a set of quantiles.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    quantiles: list
        list of floats indicating the statistical quantiles. Each element represents a column
    m: float
        this is the "m" in the m-probability estimate. Higher value of m results into stronger
        shrinking. M is non-negative. 0 for no smoothing.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    handle_missing: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.
    handle_unknown: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
    >>> display_cols = [
    ...     'Id',
    ...     'MSSubClass',
    ...     'MSZoning',
    ...     'LotFrontage',
    ...     'YearBuilt',
    ...     'Heating',
    ...     'CentralAir',
    ... ]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = SummaryEncoder(cols=['CentralAir', 'Heating'], quantiles=[0.25, 0.5, 0.75]).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 11 columns):
     #   Column         Non-Null Count  Dtype
    ---  ------         --------------  -----
     0   Id             1460 non-null   float64
     1   MSSubClass     1460 non-null   float64
     2   MSZoning       1460 non-null   object
     3   LotFrontage    1201 non-null   float64
     4   YearBuilt      1460 non-null   float64
     5   Heating_25     1460 non-null   float64
     6   Heating_50     1460 non-null   float64
     7   Heating_75     1460 non-null   float64
     8   CentralAir_25  1460 non-null   float64
     9   CentralAir_50  1460 non-null   float64
     10  CentralAir_75  1460 non-null   float64
    dtypes: float64(10), object(1)
    memory usage: 125.6+ KB
    None

    References
    ----------
    .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems,
        https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
    .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification
        and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
    .. [3] On estimating probabilities in tree pruning, equation 1,
        from https://link.springer.com/chapter/10.1007/BFb0017010
    .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
    .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
    """

    encoding_relation = util.EncodingRelation.ONE_TO_M

    # NOTE(review): no `transform` method is defined in this class body, although the
    # docstring example calls one; `encoder_list` and `return_df` are stored but never read
    # here. Presumably a mixin or code outside this view provides it — confirm.

    def __init__(
        self,
        verbose: int = 0,
        cols: list[str] | None = None,
        drop_invariant: bool = False,
        return_df: bool = True,
        handle_missing: str = 'value',
        handle_unknown: str = 'value',
        quantiles: Sequence[float] = (0.25, 0.75),
        m: float = 1.0,
    ):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.use_default_cols = (
            cols is None
        )  # if True, even a repeated call of fit() will select string columns from X
        self.ordinal_encoder = None
        self._dim = None
        self.mapping = None
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self.quantiles = quantiles
        self.m = m
        self.encoder_list = None

    def fit(self, X: util.X_type, y: util.y_type) -> SummaryEncoder:
        """Fits the encoder according to X and y by fitting the individual encoders.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If two quantiles round to the same integer percentile, since they would
            produce identically named output columns.
        """
        X, y = util.convert_inputs(X, y)

        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = len(self.feature_names_in_)

        if self.use_default_cols:
            self.cols = util.get_categorical_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        # Output columns are named `<col>_<rounded percentile>`, so the rounded
        # percentiles have to be unique.
        rounded_percentiles = [round(quantile * 100) for quantile in self.quantiles]
        if len(rounded_percentiles) != len(set(rounded_percentiles)):
            raise ValueError('There are two quantiles that belong to the same rounded percentile')

        encoder_list = []
        # Rebuilt on every fit so that refitting the same instance does not accumulate
        # invariant columns recorded by previous fits.
        drop_cols = []
        for quantile in self.quantiles:
            enc = QuantileEncoder(
                verbose=self.verbose,
                cols=self.cols,
                drop_invariant=self.drop_invariant,
                # always return df for individual encoders. If not desired this is handled below.
                return_df=True,
                handle_missing=self.handle_missing,
                handle_unknown=self.handle_unknown,
                quantile=quantile,
                m=self.m,
            )
            enc.fit(X.copy(), y)
            encoder_list.append(enc)
            drop_cols += enc.invariant_cols
        self.drop_cols = drop_cols

        # Every encoded column expands to one output column per quantile encoder (minus
        # those the encoder flagged as invariant); other columns pass through unchanged.
        # The explicit initial value keeps `reduce` well-defined for a zero-column frame.
        self.feature_names_out_ = reduce(
            operator.add,
            [
                (
                    [
                        self._get_col_name(c, enc.quantile)
                        for enc in encoder_list
                        if c not in enc.invariant_cols
                    ]
                    if c in self.cols
                    else [c]
                )
                for c in X.columns
            ],
            [],
        )
        self.encoder_list = encoder_list
        return self

    def __sklearn_tags__(self) -> util.EncoderTags:
        """Set scikit transformer tags."""
        sk_tags = super().__sklearn_tags__()
        tags = util.EncoderTags.from_sk_tags(sk_tags)
        tags.target_tags.required = True
        return tags

    def get_feature_names(self) -> np.ndarray:
        """Deprecated method to get feature names. Use `get_feature_names_out` instead."""
        msg = (
            '`get_feature_names` is deprecated in all of sklearn. '
            'Use `get_feature_names_out` instead.'
        )
        warnings.warn(msg, category=FutureWarning, stacklevel=2)
        return self.get_feature_names_out()

    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Returns the names of all transformed / added columns.

        Note that in sklearn the get_feature_names_out function takes the feature_names_in
        as an argument and determines the output feature names using the input.
        A fit is usually not necessary and if so a NotFittedError is raised.
        We just require a fit all the time and return the fitted output columns.

        Parameters
        ----------
        input_features
            Unused here; the names stored during fit are always returned.

        Returns
        -------
        feature_names: np.ndarray
            A list with all feature names transformed or added.
            Note: potentially dropped features (because the feature is constant/invariant)
            are not included!

        Raises
        ------
        NotFittedError
            If the encoder has not been fitted yet.
        """
        out_feats = getattr(self, 'feature_names_out_', None)
        if not isinstance(out_feats, list):
            raise NotFittedError('Estimator has to be fitted to return feature names.')
        return np.array(out_feats, dtype=object)

    def get_feature_names_in(self) -> np.ndarray:
        """Get the names of all input columns present when fitting.

        These columns are necessary for the transform step.

        Raises
        ------
        NotFittedError
            If the encoder has not been fitted yet.
        """
        in_feats = getattr(self, 'feature_names_in_', None)
        if isinstance(in_feats, list):
            in_feats = np.array(in_feats)
        if not isinstance(in_feats, np.ndarray):
            raise NotFittedError('Estimator has to be fitted to return feature names.')
        return in_feats

    @staticmethod
    def _get_col_name(col: str, quantile: float) -> str:
        """Build the output column name `<col>_<percentile>` for a quantile in [0, 1]."""
        percentile = round(quantile * 100)
        return f'{col}_{percentile}'