import numpy as np
import pandas as pd
from category_encoders import OrdinalEncoder
import category_encoders.utils as util
class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
    """Rank-hot (thermometer) encoder.

    The rank-hot encoder is similar to a one-hot encoder, except that every
    feature up to and including the current rank is hot. Categories are
    ranked by their sorted order as seen during ``fit``.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        passed through to ``BaseEncoder``; when True, transform returns a
        pandas DataFrame rather than a numpy array.
    use_cat_names: bool
        if True, category values will be included in the encoded column names.
        Since this can result in duplicate column names, duplicates are
        suffixed with '#' symbol until a unique name is generated.
        If False, category indices will be used instead of the category values.
    handle_unknown: str
        options are 'error', 'value', 'return_nan'. The default is 'value'.

        'value': if an unknown label occurs, it is represented as an all-zero row.
        'error': if an unknown label occurs, a ``ValueError`` is raised.
        'return_nan': if an unknown label occurs, np.nan is returned in all columns.
    handle_missing: str
        options are 'error', 'value' and 'return_nan'. The default is 'value'.
        With 'value', a missing value is encoded the same way as an unknown
        value; with 'return_nan', np.nan is returned in all columns.

    Example
    -------
    >>> from category_encoders import RankHotEncoder
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_openml
    >>> bunch = fetch_openml(name="house_prices", as_frame=True)
    >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
    >>> enc = RankHotEncoder(cols=['CentralAir', 'Heating']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)
    >>> print(numeric_dataset.info())
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1460 entries, 0 to 1459
    Data columns (total 13 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   Id            1460 non-null   float64
     1   MSSubClass    1460 non-null   float64
     2   MSZoning      1460 non-null   object
     3   LotFrontage   1201 non-null   float64
     4   YearBuilt     1460 non-null   float64
     5   Heating_1     1460 non-null   int64
     6   Heating_2     1460 non-null   int64
     7   Heating_3     1460 non-null   int64
     8   Heating_4     1460 non-null   int64
     9   Heating_5     1460 non-null   int64
     10  Heating_6     1460 non-null   int64
     11  CentralAir_1  1460 non-null   int64
     12  CentralAir_2  1460 non-null   int64
    dtypes: float64(4), int64(8), object(1)
    memory usage: 148.4+ KB
    None
    """

    prefit_ordinal = True
    encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE

    def __init__(
        self,
        verbose=0,
        cols=None,
        drop_invariant=False,
        return_df=True,
        handle_missing="value",
        handle_unknown="value",
        use_cat_names=None,
    ):
        super().__init__(
            verbose=verbose,
            cols=cols,
            drop_invariant=drop_invariant,
            return_df=return_df,
            handle_unknown=handle_unknown,
            handle_missing=handle_missing,
        )
        self._dim = None
        # List of {"col": name, "mapping": DataFrame} built by generate_mapping().
        self.mapping = None
        self.use_cat_names = use_cat_names

    def _fit(self, X, y, **kwargs):
        """Fit the internal ordinal encoder and derive the rank-hot mapping.

        Each column is first fitted on its *sorted* values so that the ordinal
        codes follow the sorted category order; those per-column mappings are
        then handed to a single ``OrdinalEncoder`` covering all columns.

        Raises
        ------
        KeyError
            if ``handle_missing`` is not one of the keys translated below.
        """
        # Translate this encoder's missing-value option into the one handed
        # to the ordinal encoder.
        oe_missing_strat = {
            "error": "error",
            "return_nan": "return_nan",
            "value": "value",
            "indicator": "return_nan",
        }[self.handle_missing]

        # Supply a custom mapping in order to assure the order of the ordinal variable.
        ordered_mapping = []
        for col in self.cols:
            oe_col = OrdinalEncoder(
                verbose=self.verbose,
                cols=[col],
                handle_unknown="value",
                handle_missing=oe_missing_strat,
            )
            oe_col.fit(X[col].sort_values().to_frame(name=col))
            ordered_mapping += oe_col.mapping

        self.ordinal_encoder = OrdinalEncoder(
            verbose=self.verbose,
            cols=self.cols,
            handle_unknown="value",
            handle_missing=oe_missing_strat,
            mapping=ordered_mapping,
        )
        self.ordinal_encoder = self.ordinal_encoder.fit(X)
        self.mapping = self.generate_mapping()
        return self

    def _transform(self, X_in, override_return_df=False):
        """Replace every encoded column by its rank-hot columns.

        The input is first ordinal-encoded; each ordinal code is then looked
        up in the lower-triangular table produced by ``generate_mapping``.
        The new columns take the position of the column they replace.

        Raises
        ------
        ValueError
            if ``handle_unknown`` is 'error' and an unseen category is found,
            if ``handle_unknown`` is not a supported option, or if a NaN is
            met while ``handle_missing`` is neither 'value' nor 'return_nan'.
        """
        X = self.ordinal_encoder.transform(X_in.copy(deep=True))
        input_cols = X.columns.tolist()

        if self.handle_unknown == "error":
            # Reject unseen categories up front, before any column is expanded.
            self._raise_on_unknown(X_in, X)

        for switch in self.mapping:
            col = switch.get("col")
            rank_table = switch.get("mapping")

            encoded = self._encode_column(X[col], rank_table)
            X = pd.concat([encoded, X], axis=1)

            # Splice the new column names in where the original column was.
            position = input_cols.index(col)
            input_cols[position:position + 1] = rank_table.columns

        # Reindexing also drops the ordinal-encoded source columns.
        return X.reindex(columns=input_cols)

    def _raise_on_unknown(self, X_raw, X_ordinal):
        """Raise a ValueError naming the first column holding unseen categories."""
        for col in self.cols:
            # -1 is the code this encoder treats as "unknown" in the ordinal output.
            unseen = (X_ordinal[col] == -1).to_numpy()
            if unseen.any():
                # Report the raw values rather than the uninformative code -1.
                values = ", ".join(str(v) for v in X_raw[col][unseen].unique())
                raise ValueError(
                    "Columns to be encoded can not contain new values: "
                    f"unseen values {values} during transform in column {col}."
                )

    def _unknown_row(self, width):
        """Return the row of length ``width`` used for unknown categories."""
        # With 'error', unknowns were already rejected; the row is then only
        # used for missing values when handle_missing is 'value'.
        if self.handle_unknown in ("value", "error"):
            return [0] * width
        if self.handle_unknown == "return_nan":
            return [np.nan] * width
        raise ValueError(f"invalid option for 'handle_unknown' parameter: {self.handle_unknown}")

    def _encode_column(self, ordinal_values, rank_table):
        """Turn one ordinal-encoded Series into its rank-hot DataFrame."""
        encoding = {
            code: list(row.values())
            for code, row in rank_table.to_dict(orient="index").items()
        }
        unknown_row = self._unknown_row(len(encoding))
        nan_row = [np.nan] * len(unknown_row)

        def encode(code):
            if pd.isna(code):
                if self.handle_missing == "value":
                    return unknown_row
                if self.handle_missing == "return_nan":
                    return nan_row
                raise ValueError("Unhandled NaN")
            return encoding.get(code, unknown_row)

        # Building the frame from a list of rows avoids a per-row Series
        # (as DataFrame.apply(axis=1) would create) and also copes with
        # empty input.
        return pd.DataFrame(
            [encode(code) for code in ordinal_values],
            index=ordinal_values.index,
            columns=rank_table.columns,
        )

    def create_dataframe(self, X, encoded, key_col):
        """Prepend ``encoded`` to ``X`` column-wise and return the result.

        ``encoded`` is used as is when it is already a DataFrame or Series;
        otherwise it is wrapped in a DataFrame whose column label(s) are
        ``key_col`` (a single label or a list of labels).
        """
        if not isinstance(encoded, (pd.DataFrame, pd.Series)):
            # A single label must be wrapped: DataFrame(columns=...) needs a collection.
            columns = key_col if pd.api.types.is_list_like(key_col) else [key_col]
            encoded = pd.DataFrame(encoded, columns=columns)
        return pd.concat([encoded, X], axis=1)

    def inverse_transform(self, X_in):
        """Rebuild the original categorical columns from rank-hot columns.

        The rank of a row is the sum of its hot columns; it is mapped back to
        the category through the inverted ordinal mapping. Rows whose rank sum
        is 0 carry no category and become NaN.

        Raises
        ------
        ValueError
            if the encoder has not been fitted, or if the column order of the
            ordinal encoder and of this encoder's mapping differ.
        """
        if self._dim is None:
            raise ValueError("Must train encoder before it can be used to inverse_transform data")

        X = X_in.copy(deep=True)
        cols = X.columns.tolist()

        for switch, ordinal_mapping in zip(self.mapping, self.ordinal_encoder.category_mapping):
            col = switch.get("col")
            cats = switch.get("mapping")
            if col != ordinal_mapping.get("col"):
                raise ValueError("Column order of OrdinalEncoder and RankHotEncoder do not match")

            inv_map = {code: cat for cat, code in ordinal_mapping.get("mapping").to_dict().items()}
            ranks = X[cats.columns].sum(axis=1).rename(col)

            # Decide which rows are "no category" from the rank itself, BEFORE
            # restoring categories, so a genuine category equal to 0 survives.
            no_category = ranks == 0

            orig_dtype = ordinal_mapping.get("data_type")
            restored = ranks.replace(inv_map).astype(orig_dtype)
            if no_category.any():
                restored = restored.where(~no_category)

            X = self.create_dataframe(X, restored, col)

            # Swap the block of rank-hot column names for the restored column.
            first_index = cols.index(cats.columns[0])
            last_index = cols.index(cats.columns[-1]) + 1
            del cols[first_index:last_index]
            cols.insert(self.ordinal_encoder.feature_names_out_.index(col), col)

        return X.reindex(columns=cols)

    def generate_mapping(self):
        """Build the rank-hot lookup table for every encoded column.

        Returns
        -------
        list of dict
            one ``{"col": name, "mapping": DataFrame}`` entry per column. The
            DataFrame is lower-triangular: its index holds the ordinal codes,
            its columns the generated feature names, and row *k* has ones in
            the first *k* columns.
        """
        mapping = []
        # Shared across columns so that generated names stay unique overall.
        found_column_counts = {}

        for switch in self.ordinal_encoder.mapping:
            col = switch.get("col")
            values = switch.get("mapping").copy(deep=True)

            if self.handle_missing == "value":
                # Keep only positive codes, i.e. drop the non-positive ones
                # (this class treats -1 as the "unknown" code).
                values = values[values > 0]

            if values.empty:
                continue

            index = []
            new_columns = []
            for cat_name, class_ in values.items():
                if self.use_cat_names:
                    n_col_name = f"{col}_{cat_name}"
                    found_count = found_column_counts.get(n_col_name, 0)
                    found_column_counts[n_col_name] = found_count + 1
                    # Disambiguate repeated names with one '#' per earlier occurrence.
                    n_col_name += "#" * found_count
                else:
                    n_col_name = f"{col}_{class_}"
                index.append(class_)
                new_columns.append(n_col_name)

            base_matrix = np.tril(np.ones((len(index), len(index)), dtype=int))
            base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index)
            mapping.append({"col": col, "mapping": base_df})

        return mapping