"""
Forest confidence intervals.
Calculate confidence intervals for scikit-learn RandomForestRegressor and
RandomForestClassifier predictions.
"""
import numpy as np
import copy
from sklearn.ensemble._forest import BaseForest
from sklearn.ensemble._forest import (_generate_sample_indices,
_get_n_samples_bootstrap)
from sklearn.ensemble._bagging import BaseBagging
from .calibration import calibrateEB
from .due import _due, _BibTeX
__all__ = ("calc_inbag", "random_forest_error", "_bias_correction",
"_core_computation")
_due.cite(
_BibTeX(
"""
@ARTICLE{Wager2014-wn,
title = "Confidence Intervals for Random Forests: The Jackknife and the Infinitesimal Jackknife",
author = "Wager, Stefan and Hastie, Trevor and Efron, Bradley",
journal = "J. Mach. Learn. Res.",
volume = 15,
number = 1,
pages = "1625--1651",
month = jan,
year = 2014,}"""
),
    description=(
        "Confidence Intervals for Random Forests: "
        "The Jackknife and the Infinitesimal Jackknife"
    ),
path="forestci",
)
def calc_inbag(n_samples, forest):
"""
Derive samples used to create trees in scikit-learn RandomForest objects.
    Recovers the samples in each tree from the random state of that tree
    using :func:`sklearn.ensemble._forest._generate_sample_indices`.
Parameters
----------
n_samples : int
The number of samples used to fit the scikit-learn RandomForest object.
forest : RandomForest
Regressor or Classifier object that is already fit by scikit-learn.
Returns
-------
    inbag : ndarray
        Array of shape (n_samples, n_estimators) recording how many times
        each training sample was drawn for each tree. Rows correspond to
        training samples and columns to individual trees.
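
    Examples
    --------
    A minimal usage sketch (illustrative only; assumes a fitted
    ``RandomForestRegressor`` trained with ``bootstrap=True``):

    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> import forestci as fci
    >>> X, y = np.random.rand(100, 4), np.random.rand(100)
    >>> forest = RandomForestRegressor(n_estimators=50).fit(X, y)
    >>> inbag = fci.calc_inbag(X.shape[0], forest)
    >>> inbag.shape  # one row per training sample, one column per tree
    (100, 50)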
"""
if not forest.bootstrap:
e_s = "Cannot calculate the inbag from a forest that has bootstrap=False"
raise ValueError(e_s)
n_trees = forest.n_estimators
inbag = np.zeros((n_samples, n_trees))
sample_idx = []
if isinstance(forest, BaseForest):
n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, forest.max_samples)
for t_idx in range(n_trees):
sample_idx.append(
_generate_sample_indices(
forest.estimators_[t_idx].random_state,
n_samples,
n_samples_bootstrap,
)
)
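            # Count how many times each training sample index was drawn
            # for this tree's bootstrap sample.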
inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
elif isinstance(forest, BaseBagging):
for t_idx, estimator_sample in enumerate(forest.estimators_samples_):
sample_idx.append(estimator_sample)
inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
return inbag
def _core_computation(
X_train,
X_test,
inbag,
pred_centered,
n_trees,
memory_constrained=False,
memory_limit=None,
test_mode=False,
):
"""
Helper function, that performs the core computation
Parameters
----------
X_train : ndarray
An array with shape (n_train_sample, n_features).
X_test : ndarray
An array with shape (n_test_sample, n_features).
inbag : ndarray
        A matrix recording how many times each training sample was used in
        each tree, as computed by :func:`calc_inbag` or supplied by the
        user.
    pred_centered : ndarray
        Centered predictions that are an intermediate result in the
        computation.
    n_trees : int
        The number of trees in the forest object.
    memory_constrained : boolean (optional)
        Whether or not there is a restriction on memory. If False, it is
        assumed that an ndarray of shape (n_train_sample, n_test_sample)
        fits in main memory. Setting to True can actually provide a speed-up
        if memory_limit is tuned to the optimal range.
    memory_limit : int (optional)
        An upper bound for how much memory the intermediate matrices will
        take up in Megabytes. This must be provided if
        memory_constrained=True.
    test_mode : boolean (optional)
        Whether to print the number of chunks used in the memory-constrained
        computation. Default: False.
    Returns
    -------
    V_IJ : ndarray
        The raw infinitesimal jackknife variance estimate, one value per
        test sample.
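
    Notes
    -----
    A sketch of the quantity computed below, in notation close to
    Wager et al. (2014):

    .. math::

        \hat{V}_{IJ}(x) = \sum_{i=1}^{n}
            \left( \frac{1}{B} \sum_{b=1}^{B}
            (N_{bi} - 1) \, (t_b(x) - \bar{t}(x)) \right)^2

    where :math:`N_{bi}` is the inbag count of training sample :math:`i`
    in tree :math:`b`, :math:`t_b(x) - \bar{t}(x)` is the centered
    prediction of tree :math:`b` at test point :math:`x`, and :math:`B`
    is ``n_trees``.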
"""
if not memory_constrained:
return np.sum((np.dot(inbag - 1, pred_centered.T) / n_trees) ** 2, 0)
if not memory_limit:
raise ValueError("If memory_constrained=True, must provide", "memory_limit.")
# Assumes double precision float
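    # chunk_size = number of test samples whose (n_train, chunk) block of
    # 8-byte floats stays within memory_limit megabytes.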
chunk_size = int((memory_limit * 1e6) / (8.0 * X_train.shape[0]))
if chunk_size == 0:
min_limit = 8.0 * X_train.shape[0] / 1e6
raise ValueError(
"memory_limit provided is too small."
+ "For these dimensions, memory_limit must "
+ "be greater than or equal to %.3e" % min_limit
)
chunk_edges = np.arange(0, X_test.shape[0] + chunk_size, chunk_size)
inds = range(X_test.shape[0])
chunks = [
inds[chunk_edges[i] : chunk_edges[i + 1]] for i in range(len(chunk_edges) - 1)
]
if test_mode:
print("Number of chunks: %d" % (len(chunks),))
V_IJ = np.concatenate(
[
np.sum((np.dot(inbag - 1, pred_centered[chunk].T) / n_trees) ** 2, 0)
for chunk in chunks
]
)
return V_IJ
def _bias_correction(V_IJ, inbag, pred_centered, n_trees):
"""
Helper functions that implements bias correction
Parameters
----------
V_IJ : ndarray
Intermediate result in the computation.
inbag : ndarray
        A matrix recording how many times each training sample was used in
        each tree, as computed by :func:`calc_inbag` or supplied by the
        user.
pred_centered : ndarray
Centered predictions that are an intermediate result in the
computation.
n_trees : int
The number of trees in the forest object.
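
    Notes
    -----
    A sketch of the correction applied below, following the Monte Carlo
    bias correction of Wager et al. (2014): the raw estimate ``V_IJ`` is
    reduced by

    .. math::

        \frac{n \, \hat{v}_N}{B^2} \sum_{b=1}^{B} (t_b(x) - \bar{t}(x))^2

    where :math:`n` is the number of training samples, :math:`B` is the
    number of trees, and :math:`\hat{v}_N` is the average variance of the
    inbag counts (approximately 1 for bootstrap resampling).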
"""
n_train_samples = inbag.shape[0]
n_var = np.mean(
np.square(inbag[0:n_trees]).mean(axis=1).T.view()
- np.square(inbag[0:n_trees].mean(axis=1)).T.view()
)
boot_var = np.square(pred_centered).sum(axis=1) / n_trees
bias_correction = n_train_samples * n_var * boot_var / n_trees
V_IJ_unbiased = V_IJ - bias_correction
return V_IJ_unbiased
def _centered_prediction_forest(forest, X_test):
"""
    Center the tree predictions by the mean prediction of the forest.
    The centering is done for all provided test samples.
This function allows unit testing for internal correctness.
Parameters
----------
forest : RandomForest
Regressor or Classifier object.
X_test : ndarray
        An array with shape (n_test_sample, n_features). The design matrix
        for testing data.
Returns
-------
pred_centered : ndarray
An array with shape (n_test_sample, n_estimators).
The predictions of each single tree centered by the
mean prediction (i.e. the prediction of the forest)
"""
# reformatting required for single sample arrays
# caution: assumption that number of features always > 1
if len(X_test.shape) == 1:
# reshape according to the reshaping annotation in scikit-learn
X_test = X_test.reshape(1, -1)
pred = np.array([tree.predict(X_test) for tree in forest]).T
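    # pred has shape (n_test_sample, n_estimators): one column per tree.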
pred_mean = np.mean(pred, 1).reshape(X_test.shape[0], 1)
return pred - pred_mean
def random_forest_error(
forest,
X_train,
X_test,
inbag=None,
calibrate=True,
memory_constrained=False,
memory_limit=None,
):
"""
    Calculate error bars from scikit-learn RandomForest estimators.
    The forest can be a regressor or a classifier; the resulting variance
    can be used to plot error bars for RandomForest predictions.
Parameters
----------
forest : RandomForest
Regressor or Classifier object.
X_train : ndarray
An array with shape (n_train_sample, n_features). The design matrix for
training data.
X_test : ndarray
        An array with shape (n_test_sample, n_features). The design matrix
        for testing data.
inbag : ndarray, optional
The inbag matrix that fit the data. If set to `None` (default) it
will be inferred from the forest. However, this only works for trees
for which bootstrapping was set to `True`. That is, if sampling was
done with replacement. Otherwise, users need to provide their own
inbag matrix.
    calibrate : boolean, optional
        Whether to apply calibration to mitigate Monte Carlo noise.
        Some variance estimates may be negative due to Monte Carlo effects
        if the number of trees in the forest is too small; calibration
        corrects for this noise. Default: True
    memory_constrained : boolean, optional
        Whether or not there is a restriction on memory. If False, it is
        assumed that an ndarray of shape (n_train_sample, n_test_sample)
        fits in main memory. Setting to True can actually provide a speed-up
        if memory_limit is tuned to the optimal range.
    memory_limit : int, optional
        An upper bound for how much memory the intermediate matrices will
        take up in Megabytes. This must be provided if
        memory_constrained=True.
Returns
-------
    An array with the unbiased sampling variance (V_IJ_unbiased)
    for a RandomForest object, one value per test sample.
See Also
    --------
:func:`calc_inbag`
Notes
-----
The calculation of error is based on the infinitesimal jackknife variance,
as described in [Wager2014]_ and is a Python implementation of the R code
provided at: https://github.com/swager/randomForestCI
.. [Wager2014] S. Wager, T. Hastie, B. Efron. "Confidence Intervals for
Random Forests: The Jackknife and the Infinitesimal Jackknife", Journal
of Machine Learning Research vol. 15, pp. 1625-1651, 2014.
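
    Examples
    --------
    A minimal usage sketch (illustrative only; assumes a fitted
    ``RandomForestRegressor`` trained with ``bootstrap=True``):

    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.model_selection import train_test_split
    >>> import forestci as fci
    >>> X, y = np.random.rand(200, 4), np.random.rand(200)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>> forest = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
    >>> V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test)
    >>> err = np.sqrt(V_IJ_unbiased)  # standard error of each test prediction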
"""
if inbag is None:
inbag = calc_inbag(X_train.shape[0], forest)
pred_centered = _centered_prediction_forest(forest, X_test)
n_trees = forest.n_estimators
V_IJ = _core_computation(
X_train, X_test, inbag, pred_centered, n_trees, memory_constrained, memory_limit
)
V_IJ_unbiased = _bias_correction(V_IJ, inbag, pred_centered, n_trees)
# Correct for cases where resampling is done without replacement:
if np.max(inbag) == 1:
variance_inflation = 1 / (1 - np.mean(inbag)) ** 2
V_IJ_unbiased *= variance_inflation
if not calibrate:
return V_IJ_unbiased
if V_IJ_unbiased.shape[0] <= 20:
print("No calibration with n_samples <= 20: ",
"consider using more n_estimators in your model, ",
"for more accurate ci and to avoid negative values.")
return V_IJ_unbiased
if calibrate:
        # Calibration is a correction for converging more quickly to the
        # case of infinite n_estimators, as presented in Wager et al. (2014)
        # http://jmlr.org/papers/v15/wager14a.html
calibration_ratio = 2
n_sample = np.ceil(n_trees / calibration_ratio)
new_forest = copy.deepcopy(forest)
random_idx = np.random.permutation(len(new_forest.estimators_))[: int(n_sample)]
new_forest.estimators_ = list(np.array(new_forest.estimators_)[random_idx])
if hasattr(new_forest, "_seeds"):
new_forest._seeds = new_forest._seeds[random_idx]
new_forest.n_estimators = int(n_sample)
results_ss = random_forest_error(
new_forest,
X_train,
X_test,
calibrate=False,
memory_constrained=memory_constrained,
memory_limit=memory_limit,
)
# Use this second set of variance estimates
# to estimate scale of Monte Carlo noise
sigma2_ss = np.mean((results_ss - V_IJ_unbiased) ** 2)
delta = n_sample / n_trees
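        # Rescale the observed squared difference between the half-forest and
        # full-forest estimates into an estimate of the Monte Carlo noise
        # variance of the full-forest estimate (delta is the subsample fraction).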
sigma2 = (delta ** 2 + (1 - delta) ** 2) / (2 * (1 - delta) ** 2) * sigma2_ss
# Use Monte Carlo noise scale estimate for empirical Bayes calibration
V_IJ_calibrated = calibrateEB(V_IJ_unbiased, sigma2)
return V_IJ_calibrated