Source code for category_encoders.quantile_encoder

"""Quantile Encoder."""

from __future__ import annotations

__author__ = 'david26694', 'cmougan'

import operator
import warnings
from functools import reduce
from typing import Sequence

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

import category_encoders.utils as util
from category_encoders.ordinal import OrdinalEncoder



[docs]
class QuantileEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
    """Quantile Encoding for categorical features.

    This is a statistically modified version of the target MEstimate encoder where selected
    features are replaced by the statistical quantile instead of the mean. Replacing with the
    median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder
    it has two tunable parameters, `m` and `quantile`.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    quantile: float
        float indicating the statistical quantile. `0.5` for the median.
    m: float
        this is the “m” in the m-probability estimate. Higher value of m results into
        stronger shrinking. M is non-negative. 0 for no smoothing.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    handle_missing: str
        options are 'error', 'return_nan'  and 'value', defaults to 'value',
        which returns the target quantile.
    handle_unknown: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
    >>> display_cols = [
    ...     'Id',
    ...     'MSSubClass',
    ...     'MSZoning',
    ...     'LotFrontage',
    ...     'YearBuilt',
    ...     'Heating',
    ...     'CentralAir',
    ... ]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = QuantileEncoder(cols=['CentralAir', 'Heating'], quantile=0.5, m=1.0).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 7 columns):
     #   Column       Non-Null Count  Dtype
    ---  ------       --------------  -----
     0   Id           1460 non-null   float64
     1   MSSubClass   1460 non-null   float64
     2   MSZoning     1460 non-null   object
     3   LotFrontage  1201 non-null   float64
     4   YearBuilt    1460 non-null   float64
     5   Heating      1460 non-null   float64
     6   CentralAir   1460 non-null   float64
    dtypes: float64(6), object(1)
    memory usage: 80.0+ KB
    None

    References
    ----------

    .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems,
        https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
    .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification
        and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
    .. [3] On estimating probabilities in tree pruning, equation 1,
        from https://link.springer.com/chapter/10.1007/BFb0017010
    .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
    .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
    """

    prefit_ordinal = True
    encoding_relation = util.EncodingRelation.ONE_TO_ONE

    def __init__(
        self,
        verbose: int = 0,
        cols: list[str] = None,
        drop_invariant: bool = False,
        return_df: bool = True,
        handle_missing: str = 'value',
        handle_unknown: str = 'value',
        quantile: float = 0.5,
        m: float = 1.0,
    ):
        """Store the hyper-parameters; no fitting happens here.

        The generic encoder options are forwarded to the parent initializer,
        while ``quantile`` and ``m`` are kept on the instance. ``mapping`` and
        ``ordinal_encoder`` start as ``None`` and are assigned in ``_fit``.
        """
        base_options = {
            'verbose': verbose,
            'cols': cols,
            'drop_invariant': drop_invariant,
            'return_df': return_df,
            'handle_unknown': handle_unknown,
            'handle_missing': handle_missing,
        }
        super().__init__(**base_options)

        # Encoder-specific hyper-parameters.
        self.quantile = quantile
        self.m = m

        # Fitted state, assigned in `_fit`.
        self.mapping = None
        self.ordinal_encoder = None

    def _fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
        """Fit the ordinal pre-encoder and compute the quantile mapping.

        Parameters
        ----------
        X: training data.
        y: target data; cast to float before any statistics are computed.
        **kwargs: accepted for interface compatibility, not used here.
        """
        target = y.astype(float)

        # The inner ordinal encoder always uses 'value' handling: unknown and
        # missing categories become sentinel codes that the quantile mapping
        # deals with afterwards.
        ordinal_options = {
            'verbose': self.verbose,
            'cols': self.cols,
            'handle_unknown': 'value',
            'handle_missing': 'value',
        }
        self.ordinal_encoder = OrdinalEncoder(**ordinal_options)
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        self.mapping = self.fit_quantile_encoding(self.ordinal_encoder.transform(X), target)


[docs]
    def fit_quantile_encoding(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series]:
        """Calculate the quantile encoding mapping.

        Parameters
        ----------
        X: training data.
        y: target data.

        Returns
        -------
        mapping col-name -> series with category-label -> quantile mapping.
        """
        mapping = {}

        # Calculate global statistics
        prior = np.quantile(y, self.quantile)

        for switch in self.ordinal_encoder.category_mapping:
            col = switch.get('col')
            values = switch.get('mapping')

            # Calculate sum, count and quantile of the target for each unique value
            # in the feature col
            stats = y.groupby(X[col]).agg([lambda x: np.quantile(x, self.quantile), 'sum', 'count'])
            stats.columns = ['quantile', 'sum', 'count']

            # Calculate the m-probability estimate of the quantile
            estimate = (stats['count'] * stats['quantile'] + prior * self.m) / (
                stats['count'] + self.m
            )

            if self.handle_unknown == 'return_nan':
                estimate.loc[-1] = np.nan
            elif self.handle_unknown == 'value':
                estimate.loc[-1] = prior

            if self.handle_missing == 'return_nan':
                estimate.loc[values.loc[np.nan]] = np.nan
            elif self.handle_missing == 'value':
                estimate.loc[-2] = prior

            mapping[col] = estimate

        return mapping


    def _transform(self, X: pd.DataFrame, y: pd.Series | None = None):
        X = self.ordinal_encoder.transform(X)

        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError('Unexpected categories found in dataframe')

        X = self.quantile_encode(X)
        return X


[docs]
    def quantile_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:
        """Apply quantile encoding."""
        X = X_in.copy(deep=True)

        for col in self.cols:
            X[col] = X[col].map(self.mapping[col])

        return X




# todo does not fit in schema since it is an ensemble of other encoders

[docs]
class SummaryEncoder(BaseEstimator):
    """Summary Encoding for categorical features.

    It's an encoder designed for creating richer representations by applying quantile
    encoding for a set of quantiles.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    quantiles: list
        list of floats indicating the statistical quantiles. Each element produces one
        output column per encoded feature.
    m: float
        this is the “m” in the m-probability estimate. Higher value of m results into stronger
        shrinking. M is non-negative. 0 for no smoothing.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    handle_missing: str
        options are 'error', 'return_nan'  and 'value', defaults to 'value',
        which returns the target quantile.
    handle_unknown: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target quantile.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
    >>> display_cols = [
    ...     'Id',
    ...     'MSSubClass',
    ...     'MSZoning',
    ...     'LotFrontage',
    ...     'YearBuilt',
    ...     'Heating',
    ...     'CentralAir',
    ... ]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = SummaryEncoder(cols=['CentralAir', 'Heating'], quantiles=[0.25, 0.5, 0.75]).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 11 columns):
     #   Column         Non-Null Count  Dtype
    ---  ------         --------------  -----
     0   Id             1460 non-null   float64
     1   MSSubClass     1460 non-null   float64
     2   MSZoning       1460 non-null   object
     3   LotFrontage    1201 non-null   float64
     4   YearBuilt      1460 non-null   float64
     5   Heating_25     1460 non-null   float64
     6   Heating_50     1460 non-null   float64
     7   Heating_75     1460 non-null   float64
     8   CentralAir_25  1460 non-null   float64
     9   CentralAir_50  1460 non-null   float64
     10  CentralAir_75  1460 non-null   float64
    dtypes: float64(10), object(1)
    memory usage: 125.6+ KB
    None

    References
    ----------
    .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems,
    https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
    .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification
    and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538
    .. [3] On estimating probabilities in tree pruning, equation 1,
    from https://link.springer.com/chapter/10.1007/BFb0017010
    .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
    .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/
    """

    encoding_relation = util.EncodingRelation.ONE_TO_M

    def __init__(
        self,
        verbose: int = 0,
        cols: list[str] = None,
        drop_invariant: bool = False,
        return_df: bool = True,
        handle_missing: str = 'value',
        handle_unknown: str = 'value',
        quantiles: Sequence[float] = (0.25, 0.75),
        m: float = 1.0,
    ):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.use_default_cols = (
            cols is None
        )  # if True, even a repeated call of fit() will select string columns from X
        self.ordinal_encoder = None
        self._dim = None
        self.mapping = None
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self.quantiles = quantiles
        self.m = m
        self.encoder_list = None


[docs]
    def fit(self, X: util.X_type, y: util.y_type) -> SummaryEncoder:
        """Fits the encoder according to X and y by fitting the individual encoders.

        One ``QuantileEncoder`` is fitted per entry of ``self.quantiles``. The
        fitted encoders are stored in ``self.encoder_list`` and the output
        column names in ``self.feature_names_out_``.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.

        Raises
        ------
        ValueError
            If two quantiles round to the same integer percentile, since they
            would produce the same output column name.

        """
        X, y = util.convert_inputs(X, y)
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = len(self.feature_names_in_)

        if self.use_default_cols:
            self.cols = util.get_categorical_cols(X)
        else:
            self.cols = util.convert_cols_to_list(self.cols)

        rounded_percentiles = [round(quantile * 100) for quantile in self.quantiles]
        if len(rounded_percentiles) != len(set(rounded_percentiles)):
            raise ValueError('There are two quantiles that belong to the same rounded percentile')

        # Collect invariant columns in a fresh list so that calling fit() again
        # does not keep appending to the results of a previous fit.
        drop_cols = []
        encoder_list = []
        for quantile in self.quantiles:
            enc = QuantileEncoder(
                verbose=self.verbose,
                cols=self.cols,
                drop_invariant=self.drop_invariant,
                # always return df for individual encoders. If not desired this is handled
                # in transform.
                return_df=True,
                handle_missing=self.handle_missing,
                handle_unknown=self.handle_unknown,
                quantile=quantile,
                m=self.m,
            )
            enc.fit(X.copy(), y)
            encoder_list.append(enc)
            drop_cols.extend(enc.invariant_cols)

        # Encoded columns expand into one name per quantile (skipping encoders
        # that flagged the column as invariant); other columns keep their name.
        # A plain loop also copes with a frame that has no columns.
        feature_names_out = []
        for c in X.columns:
            if c in self.cols:
                feature_names_out.extend(
                    self._get_col_name(c, enc.quantile)
                    for enc in encoder_list
                    if c not in enc.invariant_cols
                )
            else:
                feature_names_out.append(c)

        self.drop_cols = drop_cols
        self.feature_names_out_ = feature_names_out
        self.encoder_list = encoder_list
        return self



[docs]
    def transform(
        self, X: util.X_type, y: util.y_type | None = None, override_return_df: bool = False
    ) -> pd.DataFrame | np.ndarray:
        """Summary encode new data.

        Every fitted quantile encoder transforms a copy of ``X``; its encoded
        columns are renamed with the percentile suffix and the results are
        joined column-wise in the order given by ``get_feature_names_out``.

        Parameters
        ----------
        X: data to encode.
        y: optional target information.
        override_return_df: if true, return a DataFrame even when the
            ``return_df`` parameter is False.

        Returns
        -------
        encoded data, as a DataFrame if ``return_df`` or ``override_return_df``
        is true, otherwise as a numpy array.

        Raises
        ------
        NotFittedError
            If the encoder has not been fitted. NotFittedError is also a
            ValueError, so existing ``except ValueError`` handlers keep working.

        """
        if self.encoder_list is None:
            raise NotFittedError('Must train encoder before it can be used to transform data.')
        X, y = util.convert_inputs(X, y)

        orig_cols = X.columns
        transformed_df = X.copy()
        for idx, encoder in enumerate(self.encoder_list):
            colname_mapping = {col: self._get_col_name(col, encoder.quantile) for col in self.cols}
            X_encoded = encoder.transform(X.copy()).rename(columns=colname_mapping)
            if idx == 0:
                # The first encoder's output also carries the untouched columns.
                transformed_df = X_encoded
            else:
                # Later encoders only contribute their renamed, encoded columns.
                new_feat = X_encoded[[c for c in X_encoded.columns if c not in orig_cols]]
                transformed_df = pd.concat([transformed_df, new_feat], axis=1)
        feature_order = [c for c in self.get_feature_names_out() if c in transformed_df]
        transformed_df = transformed_df[feature_order]

        if self.return_df or override_return_df:
            return transformed_df
        else:
            return transformed_df.to_numpy()


    def __sklearn_tags__(self) -> util.EncoderTags:
        """Return the parent's tags converted to ``EncoderTags`` with the target marked required."""
        encoder_tags = util.EncoderTags.from_sk_tags(super().__sklearn_tags__())
        encoder_tags.target_tags.required = True
        return encoder_tags


[docs]
    def fit_transform(self, X: util.X_type, y: util.y_type | None = None):
        """Fit and transform using target.

        This also uses the target for transforming, not only for training.
        """
        if y is None:
            raise TypeError('fit_transform() missing argument: ' 'y' '')
        return self.fit(X, y).transform(X, y)



[docs]
    def get_feature_names(self) -> np.ndarray:
        """Deprecated method to get feature names. Use `get_feature_names_out` instead."""
        msg = (
            '`get_feature_names` is deprecated in all of sklearn. '
            'Use `get_feature_names_out` instead.'
        )
        warnings.warn(msg, category=FutureWarning, stacklevel=2)
        return self.get_feature_names_out()



[docs]
    def get_feature_names_out(self, input_features=None) -> np.ndarray:
        """Returns the names of all transformed / added columns.

        Note that in sklearn the get_feature_names_out function takes the feature_names_in
        as an argument and determines the output feature names using the input.
        A fit is usually not necessary and if so a NotFittedError is raised.
        We just require a fit all the time and return the fitted output columns.

        Returns
        -------
        feature_names: np.ndarray
            A list with all feature names transformed or added.
            Note: potentially dropped features (because the feature is constant/invariant)
            are not included!

        """
        out_feats = getattr(self, 'feature_names_out_', None)
        if not isinstance(out_feats, list):
            raise NotFittedError('Estimator has to be fitted to return feature names.')
        else:
            return np.array(out_feats, dtype=object)



[docs]
    def get_feature_names_in(self) -> np.ndarray:
        """Get the names of all input columns present when fitting.

        These columns are necessary for the transform step.
        """
        in_feats = getattr(self, 'feature_names_in_', None)
        if isinstance(in_feats, list):
            in_feats = np.array(in_feats)
        if not isinstance(in_feats, np.ndarray):
            raise NotFittedError('Estimator has to be fitted to return feature names.')
        else:
            return in_feats


    @staticmethod
    def _get_col_name(col: str, quantile: float) -> str:
        percentile = round(quantile * 100)
        return f'{col}_{percentile}'