Skip to content

Metrics

Evaluation metrics for conformal prediction.

Regression

mapie.metrics.regression.regression_coverage_score

regression_coverage_score(
    y_true: NDArray, y_intervals: NDArray
) -> NDArray

Effective coverage obtained by the prediction intervals.

Intervals given by the predict_interval method can be passed directly to the y_intervals argument (see example below).

Besides this intended use, this function also works with:

  • y_true of shape (n_sample,) and y_intervals of shape (n_sample, 2)
  • y_true of shape (n_sample, n) and y_intervals of shape (n_sample, 2, n)

The effective coverage is obtained by computing the fraction of true labels that lie within the prediction intervals.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_intervals

Lower and upper bound of prediction intervals with different confidence levels, given by the predict_interval method

TYPE: NDArray

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Effective coverage obtained by the prediction intervals for each confidence level.

Examples:

>>> from mapie.metrics.regression import regression_coverage_score
>>> from mapie.regression import SplitConformalRegressor
>>> from mapie.utils import train_conformalize_test_split
>>> from sklearn.datasets import make_regression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.linear_model import Ridge
>>> X, y = make_regression(n_samples=500, n_features=2, noise=1.0)
>>> (
...     X_train, X_conformalize, X_test,
...     y_train, y_conformalize, y_test
... ) = train_conformalize_test_split(
...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
... )
>>> mapie_regressor = SplitConformalRegressor(
...     estimator=Ridge(),
...     confidence_level=0.95,
...     prefit=False,
... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)
>>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
>>> coverage = regression_coverage_score(y_test, predicted_intervals)[0]
Source code in mapie/metrics/regression.py
def regression_coverage_score(
    y_true: NDArray,
    y_intervals: NDArray,
) -> NDArray:
    """
    Effective coverage obtained by the prediction intervals.

    Intervals returned by the `predict_interval` method can be passed
    directly as the `y_intervals` argument (see example below).

    Besides this intended use, the function also accepts:

    - `y_true` of shape (n_sample,) with `y_intervals` of shape (n_sample, 2)
    - `y_true` of shape (n_sample, n) with `y_intervals` of shape
      (n_sample, 2, n)

    The effective coverage is the fraction of true labels that fall inside
    their prediction intervals.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.

    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level)
        Lower and upper bounds of the prediction intervals with different
        confidence levels, as given by the `predict_interval` method.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        Effective coverage obtained by the prediction intervals
        for each confidence level.

    Examples
    --------
    >>> from mapie.metrics.regression import regression_coverage_score
    >>> from mapie.regression import SplitConformalRegressor
    >>> from mapie.utils import train_conformalize_test_split
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.linear_model import Ridge

    >>> X, y = make_regression(n_samples=500, n_features=2, noise=1.0)
    >>> (
    ...     X_train, X_conformalize, X_test,
    ...     y_train, y_conformalize, y_test
    ... ) = train_conformalize_test_split(
    ...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
    ... )

    >>> mapie_regressor = SplitConformalRegressor(
    ...     estimator=Ridge(),
    ...     confidence_level=0.95,
    ...     prefit=False,
    ... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)

    >>> predicted_points, predicted_intervals = mapie_regressor.predict_interval(X_test)
    >>> coverage = regression_coverage_score(y_test, predicted_intervals)[0]
    """
    # Validate inputs before any computation.
    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    y_intervals = _check_array_shape_regression(y_true, y_intervals)
    # A 1D target gets a trailing axis so it broadcasts against the
    # confidence-level axis of the intervals.
    if y_true.ndim != 2:
        y_true = np.expand_dims(cast(NDArray, column_or_1d(y_true)), axis=1)
    inside = (y_intervals[:, 0, :] <= y_true) & (y_intervals[:, 1, :] >= y_true)
    return cast(NDArray, inside.mean(axis=0))

mapie.metrics.regression.regression_mean_width_score

regression_mean_width_score(
    y_intervals: NDArray,
) -> NDArray

Effective mean width score obtained by the prediction intervals.

PARAMETER DESCRIPTION
y_intervals

Lower and upper bound of prediction intervals with different confidence levels, given by the predict_interval method

TYPE: NDArray

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Effective mean width of the prediction intervals for each confidence level.

Examples:

>>> import numpy as np
>>> from mapie.metrics.regression import regression_mean_width_score
>>> y_intervals = np.array([[[4, 6, 8], [6, 9, 11]],
...                    [[9, 10, 11], [10, 12, 14]],
...                    [[8.5, 9.5, 10], [12.5, 12, 13]],
...                    [[7, 8, 9], [8.5, 9.5, 10]],
...                    [[5, 6, 7], [6.5, 8, 9]]])
>>> print(regression_mean_width_score(y_intervals))
[2.  2.2 2.4]
Source code in mapie/metrics/regression.py
def regression_mean_width_score(y_intervals: NDArray) -> NDArray:
    """
    Effective mean width score obtained by the prediction intervals.

    Parameters
    ----------
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level)
        Lower and upper bounds of the prediction intervals with different
        confidence levels, as given by the `predict_interval` method.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        Effective mean width of the prediction intervals for each confidence level.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.regression import regression_mean_width_score
    >>> y_intervals = np.array([[[4, 6, 8], [6, 9, 11]],
    ...                    [[9, 10, 11], [10, 12, 14]],
    ...                    [[8.5, 9.5, 10], [12.5, 12, 13]],
    ...                    [[7, 8, 9], [8.5, 9.5, 10]],
    ...                    [[5, 6, 7], [6.5, 8, 9]]])
    >>> print(regression_mean_width_score(y_intervals))
    [2.  2.2 2.4]
    """
    y_intervals = np.asarray(y_intervals, dtype=float)

    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    # Per-sample width for each confidence level, averaged over samples.
    interval_widths = np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :])
    return cast(NDArray, np.mean(interval_widths, axis=0))

mapie.metrics.regression.regression_ssc

regression_ssc(
    y_true: NDArray, y_intervals: NDArray, num_bins: int = 3
) -> NDArray

Compute Size-Stratified Coverage metrics proposed in [3] that is the conditional coverage conditioned by the size of the intervals. The intervals are ranked by their size (ascending) and then divided into num_bins groups: one value of coverage by groups is computed.

Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted.

[3] Angelopoulos, A. N., & Bates, S. (2021). A gentle introduction to conformal prediction and distribution-free uncertainty quantification. arXiv preprint arXiv:2107.07511.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_intervals

Prediction intervals given by booleans of labels.

TYPE: NDArray

num_bins

Number of groups. Should be less than the number of different interval widths.

TYPE: int DEFAULT: 3

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level, num_bins)

Examples:

>>> from mapie.metrics.regression import regression_ssc
>>> import numpy as np
>>> y_true = np.array([5, 7.5, 9.5])
>>> y_intervals = np.array([
... [4, 6],
... [6.0, 9.0],
... [9, 10.0]
... ])
>>> print(regression_ssc(y_true, y_intervals, num_bins=2))
[[1. 1.]]
Source code in mapie/metrics/regression.py
def regression_ssc(y_true: NDArray, y_intervals: NDArray, num_bins: int = 3) -> NDArray:
    """
    Compute Size-Stratified Coverage metrics proposed in [3] that is
    the conditional coverage conditioned by the size of the intervals.
    The intervals are ranked by their size (ascending) and then divided into
    num_bins groups: one value of coverage by groups is computed.

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [3] Angelopoulos, A. N., & Bates, S. (2021).
    A gentle introduction to conformal prediction and
    distribution-free uncertainty quantification.
    arXiv preprint arXiv:2107.07511.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction intervals given by booleans of labels.
    num_bins: int
        Number of groups. Should be less than the number of different
        interval widths.

    Returns
    -------
    NDArray of shape (n_confidence_level, num_bins)
        Coverage computed inside each width-based bin, for each
        confidence level.

    Examples
    --------
    >>> from mapie.metrics.regression import regression_ssc
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5])
    >>> y_intervals = np.array([
    ... [4, 6],
    ... [6.0, 9.0],
    ... [9, 10.0]
    ... ])
    >>> print(regression_ssc(y_true, y_intervals, num_bins=2))
    [[1. 1.]]
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_intervals = _check_array_shape_regression(y_true, y_intervals)
    _check_number_bins(num_bins)
    widths = np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :])
    _check_nb_intervals_sizes(widths, num_bins)

    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    # Rank samples by interval width (ascending), independently for each
    # confidence level, then split the ranking into num_bins groups.
    indexes_sorted = np.argsort(widths, axis=0)
    indexes_bybins = np.array_split(indexes_sorted, num_bins, axis=0)
    coverages = np.zeros((y_intervals.shape[2], num_bins))
    for i, indexes in enumerate(indexes_bybins):
        # Gather the lower/upper bounds of the intervals falling in this bin.
        intervals_binned = np.stack(
            [
                np.take_along_axis(y_intervals[:, 0, :], indexes, axis=0),
                np.take_along_axis(y_intervals[:, 1, :], indexes, axis=0),
            ],
            axis=1,
        )
        coverages[:, i] = regression_coverage_score(y_true[indexes], intervals_binned)

    return coverages

mapie.metrics.regression.regression_ssc_score

regression_ssc_score(
    y_true: NDArray, y_intervals: NDArray, num_bins: int = 3
) -> NDArray

Aggregate by the minimum for each confidence level the Size-Stratified Coverage [3]: returns the maximum violation of the conditional coverage (with the groups defined).

Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted.

[3] Angelopoulos, A. N., & Bates, S. (2021). A gentle introduction to conformal prediction and distribution-free uncertainty quantification. arXiv preprint arXiv:2107.07511.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_intervals

Prediction intervals given by booleans of labels.

TYPE: NDArray

num_bins

Number of groups. Should be less than the number of different interval widths.

TYPE: int DEFAULT: 3

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Examples:

>>> from mapie.metrics.regression import regression_ssc_score
>>> import numpy as np
>>> y_true = np.array([5, 7.5, 9.5])
>>> y_intervals = np.array([
... [[4, 4], [6, 7.5]],
... [[6.0, 8], [9.0, 10]],
... [[9, 9], [10.0, 10.0]]
... ])
>>> print(regression_ssc_score(y_true, y_intervals, num_bins=2))
[1.  0.5]
Source code in mapie/metrics/regression.py
def regression_ssc_score(
    y_true: NDArray, y_intervals: NDArray, num_bins: int = 3
) -> NDArray:
    """
    Aggregate by the minimum for each confidence level the Size-Stratified Coverage [3]:
    returns the maximum violation of the conditional coverage
    (with the groups defined).

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [3] Angelopoulos, A. N., & Bates, S. (2021).
    A gentle introduction to conformal prediction and
    distribution-free uncertainty quantification.
    arXiv preprint arXiv:2107.07511.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction intervals given by booleans of labels.
    num_bins: int
        Number of groups. Should be less than the number of different
        interval widths.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        Minimum bin coverage (i.e. the worst conditional coverage)
        for each confidence level.

    Examples
    --------
    >>> from mapie.metrics.regression import regression_ssc_score
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5])
    >>> y_intervals = np.array([
    ... [[4, 4], [6, 7.5]],
    ... [[6.0, 8], [9.0, 10]],
    ... [[9, 9], [10.0, 10.0]]
    ... ])
    >>> print(regression_ssc_score(y_true, y_intervals, num_bins=2))
    [1.  0.5]
    """
    return cast(NDArray, np.min(regression_ssc(y_true, y_intervals, num_bins), axis=1))

mapie.metrics.regression.hsic

hsic(
    y_true: NDArray,
    y_intervals: NDArray,
    kernel_sizes: ArrayLike = (1, 1),
) -> NDArray

Compute the square root of the hsic coefficient. HSIC is Hilbert-Schmidt independence criterion that is a correlation measure. Here we use it as proposed in [4], to compute the correlation between the indicator of coverage and the interval size.

If hsic is 0, the two variables (the indicator of coverage and the interval size) are independent.

Warning: This metric should be used only with non constant intervals (intervals of different sizes), with constant intervals the result may be misinterpreted.

[4] Feldman, S., Bates, S., & Romano, Y. (2021). Improving conditional coverage via orthogonal quantile regression. Advances in Neural Information Processing Systems, 34, 2060-2071.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_intervals

Prediction sets given by booleans of labels.

TYPE: NDArray

kernel_sizes

The variance (sigma) for each variable (the indicator of coverage and the interval size), this coefficient controls the width of the curve.

TYPE: ArrayLike DEFAULT: (1, 1)

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

One hsic correlation coefficient by confidence level.

RAISES DESCRIPTION
ValueError

If kernel_sizes has a length different from 2 and if it has negative or null values.

Examples:

>>> from mapie.metrics.regression import hsic
>>> import numpy as np
>>> y_true = np.array([9.5, 10.5, 12.5])
>>> y_intervals = np.array([
... [[9, 9], [10.0, 10.0]],
... [[8.5, 9], [12.5, 12]],
... [[10.5, 10.5], [12.0, 12]]
... ])
>>> print(hsic(y_true, y_intervals))
[0.31787614 0.2962914 ]
Source code in mapie/metrics/regression.py
def hsic(
    y_true: NDArray, y_intervals: NDArray, kernel_sizes: ArrayLike = (1, 1)
) -> NDArray:
    """
    Compute the square root of the hsic coefficient. HSIC is Hilbert-Schmidt
    independence criterion that is a correlation measure. Here we use it as
    proposed in [4], to compute the correlation between the indicator of
    coverage and the interval size.

    If hsic is 0, the two variables (the indicator of coverage and the
    interval size) are independent.

    Warning: This metric should be used only with non constant intervals
    (intervals of different sizes), with constant intervals the result
    may be misinterpreted.

    [4] Feldman, S., Bates, S., & Romano, Y. (2021).
    Improving conditional coverage via orthogonal quantile regression.
    Advances in Neural Information Processing Systems, 34, 2060-2071.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_intervals: NDArray of shape (n_samples, 2, n_confidence_level) or (n_samples, 2)
        Prediction sets given by booleans of labels.
    kernel_sizes: ArrayLike of size (2,)
        The variance (sigma) for each variable (the indicator of coverage and
        the interval size), this coefficient controls the width of the curve.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        One hsic correlation coefficient by confidence level.

    Raises
    ------
    ValueError
        If kernel_sizes has a length different from 2
        and if it has negative or null values.

    Examples
    --------
    >>> from mapie.metrics.regression import hsic
    >>> import numpy as np
    >>> y_true = np.array([9.5, 10.5, 12.5])
    >>> y_intervals = np.array([
    ... [[9, 9], [10.0, 10.0]],
    ... [[8.5, 9], [12.5, 12]],
    ... [[10.5, 10.5], [12.0, 12]]
    ... ])
    >>> print(hsic(y_true, y_intervals))
    [0.31787614 0.2962914 ]
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_intervals = _check_array_shape_regression(y_true, y_intervals)

    _check_arrays_length(y_true, y_intervals)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_intervals)
    _check_array_inf(y_intervals)

    kernel_sizes = cast(NDArray, column_or_1d(kernel_sizes))
    if len(kernel_sizes) != 2:
        raise ValueError("kernel_sizes should be an ArrayLike of length 2")
    if (kernel_sizes <= 0).any():
        raise ValueError("kernel_size should be positive")
    n_samples, _, n_confidence_level = y_intervals.shape
    # Replicate y_true once per confidence level: shape (n_samples, n_confidence_level).
    y_true_per_alpha = np.tile(y_true, (n_confidence_level, 1)).transpose()
    # Interval widths, shaped (n_confidence_level, n_samples, 1) for the kernel.
    widths = np.expand_dims(
        np.abs(y_intervals[:, 1, :] - y_intervals[:, 0, :]).transpose(), axis=2
    )
    # Coverage indicator (1 if the true value is inside the interval, else 0),
    # same shape as `widths`.
    cov_ind = np.expand_dims(
        np.int_(
            (
                (y_intervals[:, 0, :] <= y_true_per_alpha)
                & (y_intervals[:, 1, :] >= y_true_per_alpha)
            )
        ).transpose(),
        axis=2,
    )

    k_mat = _gaussian_kernel(widths, kernel_sizes[0])
    l_mat = _gaussian_kernel(cov_ind, kernel_sizes[1])
    # Centering matrix H = I - 1/n * ones, used to center both Gram matrices.
    h_mat = np.eye(n_samples) - 1 / n_samples * np.ones((n_samples, n_samples))
    hsic_mat = np.matmul(l_mat, np.matmul(h_mat, np.matmul(k_mat, h_mat)))
    hsic_mat /= (n_samples - 1) ** 2
    # np.trace over the last two axes gives one HSIC value per confidence
    # level. (`np.matrix.trace` was used before; np.matrix is deprecated.)
    coef_hsic = np.sqrt(np.trace(hsic_mat, axis1=1, axis2=2))

    return cast(NDArray, coef_hsic)

mapie.metrics.regression.coverage_width_based

coverage_width_based(
    y_true: ArrayLike,
    y_pred_low: ArrayLike,
    y_pred_up: ArrayLike,
    eta: float,
    confidence_level: float,
) -> float

Coverage Width-based Criterion (CWC) obtained by the prediction intervals.

The effective coverage score is a criterion used to evaluate the quality of prediction intervals (PIs) based on their coverage and width.

Khosravi, Abbas, Saeid Nahavandi, and Doug Creighton. "Construction of optimal prediction intervals for load forecasting problems." IEEE Transactions on Power Systems 25.3 (2010): 1496-1503.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: ArrayLike of shape (n_samples,)

y_pred_low

Lower bound of the prediction intervals.

TYPE: ArrayLike of shape (n_samples,)

y_pred_up

Upper bound of the prediction intervals.

TYPE: ArrayLike of shape (n_samples,)

eta

A user-defined parameter that balances the contributions of mean width score and coverage score in the CWC calculation.

TYPE: float

confidence_level

A user-defined parameter representing the designed confidence level of the PI.

TYPE: float

RETURNS DESCRIPTION
float

Effective coverage score (CWC) obtained by the prediction intervals.

Notes

The effective coverage score (CWC) is calculated using the following formula: CWC = (1 - Mean Width Score) * exp(-eta * (Coverage score - (1-alpha))**2)

The CWC penalizes under- and overcoverage in the same way and summarizes the quality of the prediction intervals in a single value.

High Eta (Large Positive Value):

When eta is a high positive value, it will strongly emphasize the contribution of (1-Mean Width Score). This means that the algorithm will prioritize reducing the average width of the prediction intervals (Mean Width Score) over achieving a high coverage probability (Coverage score). The exponential term np.exp(-eta * (Coverage score - (1-alpha))**2) will have a sharp decline as Coverage score deviates from (1-alpha). So, achieving a high Coverage score becomes less important compared to minimizing Mean Width Score. The impact will be narrower prediction intervals on average, which may result in more precise but less conservative predictions.

Low Eta (Small Positive Value):

When eta is a low positive value, it will still prioritize reducing the average width of the prediction intervals (Mean Width Score) but with less emphasis compared to higher eta values. The exponential term will be less steep, meaning that deviations of Coverage score from (1-alpha) will have a moderate impact. You'll get a balance between prediction precision and coverage, but the exact balance will depend on the specific value of eta.

Negative Eta (Any Negative Value):

When eta is negative, it will have a different effect on the formula. Negative values of eta will cause the exponential term np.exp(-eta * (Coverage score - (1-alpha))**2) to become larger as Coverage score deviates from (1-alpha). This means that a negative eta prioritizes achieving a high coverage probability (Coverage score) over minimizing Mean Width Score. In this case, the algorithm will aim to produce wider prediction intervals to ensure a higher likelihood of capturing the true values within those intervals, even if it sacrifices precision. Negative eta values might be used in scenarios where avoiding errors or outliers is critical.

Null Eta (Eta = 0):

Specifically, when eta is zero, the CWC score becomes equal to (1 - Mean Width Score), which is equivalent to (1 - average width of the prediction intervals). Therefore, in this case, the CWC score is primarily based on the size of the prediction interval.

Examples:

>>> from mapie.metrics.regression import coverage_width_based
>>> import numpy as np
>>> y_true = np.array([5, 7.5, 9.5, 10.5, 12.5])
>>> y_preds_low = np.array([4, 6, 9, 8.5, 10.5])
>>> y_preds_up = np.array([6, 9, 10, 12.5, 12])
>>> eta = 0.01
>>> confidence_level = 0.9
>>> cwb = coverage_width_based(
... y_true, y_preds_low, y_preds_up, eta, confidence_level
... )
>>> print(np.round(cwb ,2))
0.69
Source code in mapie/metrics/regression.py
def coverage_width_based(
    y_true: ArrayLike,
    y_pred_low: ArrayLike,
    y_pred_up: ArrayLike,
    eta: float,
    confidence_level: float,
) -> float:
    """
    Coverage Width-based Criterion (CWC) obtained by the prediction intervals.

    The effective coverage score is a criterion used to evaluate the quality
    of prediction intervals (PIs) based on their coverage and width.

    Khosravi, Abbas, Saeid Nahavandi, and Doug Creighton.
    "Construction of optimal prediction intervals for load forecasting
    problems."
    IEEE Transactions on Power Systems 25.3 (2010): 1496-1503.

    Parameters
    ----------
    y_true : ArrayLike of shape (n_samples,)
        True labels.
    y_pred_low : ArrayLike of shape (n_samples,)
        Lower bound of the prediction intervals.
    y_pred_up : ArrayLike of shape (n_samples,)
        Upper bound of the prediction intervals.
    eta : float
        A user-defined parameter that balances the contributions of
        mean width score and coverage score in the CWC calculation.
    confidence_level : float
        A user-defined parameter representing the designed confidence level of
        the PI.

    Returns
    -------
    float
        Effective coverage score (CWC) obtained by the prediction intervals.

    Notes
    -----
    The CWC is computed as:
    CWC = (1 - Mean Width Score) * exp(-eta * (Coverage score - (1-alpha))**2)

    It penalizes under- and overcoverage in the same way and summarizes the
    quality of the prediction intervals in a single value. The role of eta:

    - High positive eta: strongly emphasizes (1 - Mean Width Score); the
      exponential term np.exp(-eta * (Coverage score - (1-alpha))**2) drops
      sharply as the coverage deviates from (1-alpha), so narrow intervals
      are favored over coverage, giving more precise but less conservative
      predictions.
    - Low positive eta: still favors narrow intervals, but deviations of the
      coverage from (1-alpha) have a more moderate impact, balancing
      precision against coverage.
    - Negative eta: the exponential term grows as the coverage deviates from
      (1-alpha), so a high coverage probability is favored over narrow
      intervals, producing wider, more conservative intervals; useful when
      avoiding errors or outliers is critical.
    - Eta = 0: the CWC reduces to (1 - Mean Width Score) and is driven purely
      by the size of the prediction intervals.

    Examples
    --------
    >>> from mapie.metrics.regression import coverage_width_based
    >>> import numpy as np
    >>> y_true = np.array([5, 7.5, 9.5, 10.5, 12.5])
    >>> y_preds_low = np.array([4, 6, 9, 8.5, 10.5])
    >>> y_preds_up = np.array([6, 9, 10, 12.5, 12])
    >>> eta = 0.01
    >>> confidence_level = 0.9
    >>> cwb = coverage_width_based(
    ... y_true, y_preds_low, y_preds_up, eta, confidence_level
    ... )
    >>> print(np.round(cwb ,2))
    0.69
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    y_pred_low = cast(NDArray, column_or_1d(y_pred_low))
    y_pred_up = cast(NDArray, column_or_1d(y_pred_up))

    _check_alpha(confidence_level)

    intervals = np.column_stack((y_pred_low, y_pred_up))
    coverage = regression_coverage_score(y_true, intervals)[0]
    # regression_mean_width_score expects a trailing confidence-level axis.
    mean_width = regression_mean_width_score(intervals[:, :, np.newaxis])[0]
    # Normalize the width by the range of the observed targets.
    ref_length = float(y_true.max()) - float(y_true.min())
    relative_width = mean_width / ref_length

    penalty = np.exp(-eta * (coverage - confidence_level) ** 2)
    return float((1 - relative_width) * penalty)

mapie.metrics.regression.regression_mwi_score

regression_mwi_score(
    y_true: NDArray, y_pis: NDArray, confidence_level: float
) -> float

The Winkler score, proposed by Winkler (1972), is a measure used to evaluate prediction intervals, combining the length of the interval with a penalty that increases proportionally to the distance of an observation outside the interval.

PARAMETER DESCRIPTION
y_true

Ground truth values

TYPE: NDArray

y_pis

Lower and upper bounds of prediction intervals output from a MAPIE regressor

TYPE: NDArray

confidence_level

The value of confidence_level

TYPE: float

RETURNS DESCRIPTION
float

The mean Winkler interval score

References

[1] Robert L. Winkler "A Decision-Theoretic Approach to Interval Estimation", Journal of the American Statistical Association, volume 67, pages 187-191 (1972) (https://doi.org/10.1080/01621459.1972.10481224) [2] Tilmann Gneiting and Adrian E Raftery "Strictly Proper Scoring Rules, Prediction, and Estimation", Journal of the American Statistical Association, volume 102, pages 359-378 (2007) (https://doi.org/10.1198/016214506000001437) (Section 6.2)

Source code in mapie/metrics/regression.py
def regression_mwi_score(
    y_true: NDArray, y_pis: NDArray, confidence_level: float
) -> float:
    """
    The Winkler score, proposed by Winkler (1972), is a measure used to
    evaluate prediction intervals, combining the length of the interval
    with a penalty that increases proportionally to the distance of an
    observation outside the interval.

    Parameters
    ----------
    y_true: ArrayLike of shape (n_samples,)
        Ground truth values
    y_pis: ArrayLike of shape (n_samples, 2, 1)
        Lower and upper bounds of prediction intervals
        output from a MAPIE regressor
    confidence_level: float
        The value of confidence_level

    Returns
    -------
    float
        The mean Winkler interval score

    References
    ----------
    [1] Robert L. Winkler
    "A Decision-Theoretic Approach to Interval Estimation",
    Journal of the American Statistical Association,
    volume 67, pages 187-191 (1972)
    (https://doi.org/10.1080/01621459.1972.10481224)
    [2] Tilmann Gneiting and Adrian E Raftery
    "Strictly Proper Scoring Rules, Prediction, and Estimation",
    Journal of the American Statistical Association,
    volume 102, pages 359-378 (2007)
    (https://doi.org/10.1198/016214506000001437) (Section 6.2)
    """

    # Sort each pair of bounds so lower <= upper, undoing any quantile crossing.
    lower = np.minimum(y_pis[:, 0, 0], y_pis[:, 1, 0])
    upper = np.maximum(y_pis[:, 0, 0], y_pis[:, 1, 0])

    _check_arrays_length(y_true, lower, upper)

    # Reject NaN and infinite values in all inputs.
    for arr in (y_true, lower, upper):
        _check_array_nan(arr)
        _check_array_inf(arr)

    # Total interval width plus a penalty of 2/alpha per unit of miss distance.
    total_width = np.sum(upper) - np.sum(lower)
    overshoot: float = np.sum((y_true - upper)[y_true > upper])
    undershoot: float = np.sum((lower - y_true)[y_true < lower])
    penalty = (overshoot + undershoot) * 2 / (1 - confidence_level)
    return float((total_width + penalty) / len(y_true))

Classification

mapie.metrics.classification.classification_coverage_score

classification_coverage_score(
    y_true: NDArray, y_pred_set: NDArray
) -> NDArray

Effective coverage score obtained by the prediction sets.

The effective coverage is obtained by estimating the fraction of true labels that lie within the prediction sets.

Prediction sets obtained by the predict method can be passed directly to the y_pred_set argument (see example below).

Besides this intended use, this function also works with:

  • y_true of shape (n_sample,) and y_pred_set of shape (n_sample, n_class)
  • y_true of shape (n_sample, n) and y_pred_set of shape (n_sample, n_class, n)
PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_pred_set

Prediction sets with different confidence levels, given by booleans of labels with the predict method.

TYPE: NDArray

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Effective coverage obtained by the prediction sets for each confidence level.

Examples:

>>> from mapie.metrics.classification import classification_coverage_score
>>> from mapie.classification import SplitConformalClassifier
>>> from mapie.utils import train_conformalize_test_split
>>> from sklearn.datasets import make_classification
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.neighbors import KNeighborsClassifier
>>> X, y = make_classification(n_samples=500)
>>> (
...     X_train, X_conformalize, X_test,
...     y_train, y_conformalize, y_test
... ) = train_conformalize_test_split(
...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
... )
>>> mapie_classifier = SplitConformalClassifier(
...     estimator=KNeighborsClassifier(),
...     confidence_level=[0.9, 0.95, 0.99],
...     prefit=False,
... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)
>>> predicted_points, predicted_sets = mapie_classifier.predict_set(X_test)
>>> coverage = classification_coverage_score(y_test, predicted_sets)[0]
Source code in mapie/metrics/classification.py
def classification_coverage_score(y_true: NDArray, y_pred_set: NDArray) -> NDArray:
    """
    Compute the effective coverage of classification prediction sets.

    The effective coverage is the fraction of true labels that belong to
    their prediction set, estimated separately for each confidence level.

    Prediction sets obtained by the `predict` method can be passed directly
    to the `y_pred_set` argument (see example below).

    Beside this intended use, this function also works with:

    - `y_true` of shape (n_sample,) and `y_pred_set` of shape (n_sample, n_class)
    - `y_true` of shape (n_sample, n) and `y_pred_set` of shape
      (n_sample, n_class, n)

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.

    y_pred_set: NDArray of shape (n_samples, n_class, n_confidence_level)
        Boolean prediction sets, one per confidence level, as given by the
        `predict` method.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        Effective coverage of the prediction sets, one value per
        confidence level.

    Examples
    --------
    >>> from mapie.metrics.classification import classification_coverage_score
    >>> from mapie.classification import SplitConformalClassifier
    >>> from mapie.utils import train_conformalize_test_split
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.neighbors import KNeighborsClassifier

    >>> X, y = make_classification(n_samples=500)
    >>> (
    ...     X_train, X_conformalize, X_test,
    ...     y_train, y_conformalize, y_test
    ... ) = train_conformalize_test_split(
    ...     X, y, train_size=0.6, conformalize_size=0.2, test_size=0.2, random_state=1
    ... )

    >>> mapie_classifier = SplitConformalClassifier(
    ...     estimator=KNeighborsClassifier(),
    ...     confidence_level=[0.9, 0.95, 0.99],
    ...     prefit=False,
    ... ).fit(X_train, y_train).conformalize(X_conformalize, y_conformalize)

    >>> predicted_points, predicted_sets = mapie_classifier.predict_set(X_test)
    >>> coverage = classification_coverage_score(y_test, predicted_sets)[0]
    """
    # Validate inputs before any reshaping.
    _check_arrays_length(y_true, y_pred_set)
    for arr in (y_true, y_pred_set):
        _check_array_nan(arr)
        _check_array_inf(arr)

    y_pred_set = _check_array_shape_classification(y_true, y_pred_set)
    # Bring y_true to shape (n_samples, 1[, n]) so it can index the class
    # axis of y_pred_set with take_along_axis.
    if y_true.ndim != 2:
        y_true = np.expand_dims(cast(NDArray, column_or_1d(y_true)), axis=1)
    y_true = y_true[:, np.newaxis, ...]
    # Pick, for each sample, whether its true label is in the set, then
    # average over samples for each confidence level.
    membership = np.take_along_axis(y_pred_set, y_true, axis=1)
    return cast(NDArray, np.nanmean(membership, axis=0)[0])

mapie.metrics.classification.classification_mean_width_score

classification_mean_width_score(
    y_pred_set: ArrayLike,
) -> NDArray

Mean width of prediction set output by mapie.classification._MapieClassifier.

PARAMETER DESCRIPTION
y_pred_set

Prediction sets given by booleans of labels.

TYPE: ArrayLike

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Mean width of the prediction sets for each confidence level.

Examples:

>>> import numpy as np
>>> from mapie.metrics.classification import classification_mean_width_score
>>> y_pred_set = np.array([
...     [[False, False], [False, True], [True, True]],
...     [[False, True], [True, False], [True, True]],
...     [[True, False], [True, True], [True, False]],
...     [[False, False], [True, True], [True, True]],
...     [[True, True], [False, True], [True, False]]
... ])
>>> print(classification_mean_width_score(y_pred_set))
[2.  1.8]
Source code in mapie/metrics/classification.py
def classification_mean_width_score(y_pred_set: ArrayLike) -> NDArray:
    """
    Mean width of the prediction sets output by
    `mapie.classification._MapieClassifier`.

    Parameters
    ----------
    y_pred_set: NDArray of shape (n_samples, n_class, n_confidence_level)
        Boolean prediction sets.

    Returns
    -------
    NDArray of shape (n_confidence_level,)
        Mean width of the prediction sets, one value per confidence level.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.classification import classification_mean_width_score
    >>> y_pred_set = np.array([
    ...     [[False, False], [False, True], [True, True]],
    ...     [[False, True], [True, False], [True, True]],
    ...     [[True, False], [True, True], [True, False]],
    ...     [[False, False], [True, True], [True, True]],
    ...     [[True, True], [False, True], [True, False]]
    ... ])
    >>> print(classification_mean_width_score(y_pred_set))
    [2.  1.8]
    """
    sets = np.asarray(y_pred_set, dtype=bool)
    _check_array_nan(sets)
    _check_array_inf(sets)
    # Width of a set = number of labels it contains (sum over the class
    # axis); average those widths over samples.
    mean_width = sets.sum(axis=1).mean(axis=0)
    return cast(NDArray, mean_width)

mapie.metrics.classification.classification_ssc

classification_ssc(
    y_true: NDArray,
    y_pred_set: NDArray,
    num_bins: Union[int, None] = None,
) -> NDArray

Compute Size-Stratified Coverage metrics proposed in [3], that is, the conditional coverage conditioned on the size of the prediction sets. The sets are ranked by their size (ascending) and then divided into num_bins groups: one coverage value per group is computed.

[3] Angelopoulos, A. N., & Bates, S. (2021). A gentle introduction to conformal prediction and distribution-free uncertainty quantification. arXiv preprint arXiv:2107.07511.

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_pred_set

Prediction sets given by booleans of labels.

TYPE: NDArray

num_bins

Number of groups. If None, one value of coverage by possible size of sets (n_classes +1) is computed. Should be less than the number of different set sizes.

TYPE: Union[int, None] DEFAULT: None

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level, num_bins)

Examples:

>>> from mapie.metrics.classification import classification_ssc
>>> import numpy as np
>>> y_true = y_true_class = np.array([3, 3, 1, 2, 2])
>>> y_pred_set = np.array([
...    [True, True, True, True],
...    [False, True, False, True],
...    [True, True, True, False],
...    [False, False, True, True],
...    [True, True, False, True]])
>>> print(classification_ssc(y_true, y_pred_set, num_bins=2))
[[1.         0.66666667]]
Source code in mapie/metrics/classification.py
def classification_ssc(
    y_true: NDArray, y_pred_set: NDArray, num_bins: Union[int, None] = None
) -> NDArray:
    """
    Compute Size-Stratified Coverage metrics proposed in [3] that is
    the conditional coverage conditioned by the size of the predictions sets.
    The sets are ranked by their size (ascending) and then divided into
    num_bins groups: one value of coverage by groups is computed.

    [3] Angelopoulos, A. N., & Bates, S. (2021).
    A gentle introduction to conformal prediction and
    distribution-free uncertainty quantification.
    arXiv preprint arXiv:2107.07511.

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_pred_set: NDArray of shape (n_samples, n_class, n_confidence_level) or (n_samples, n_class)
        Prediction sets given by booleans of labels.
    num_bins: int or None
        Number of groups. If None, one value of coverage by possible
        size of sets (n_classes +1) is computed. Should be less than the
        number of different set sizes.

    Returns
    -------
    NDArray of shape (n_confidence_level, num_bins)
        Coverage of the prediction sets within each size bin, one row per
        confidence level.

    Examples
    --------
    >>> from mapie.metrics.classification import classification_ssc
    >>> import numpy as np
    >>> y_true = y_true_class = np.array([3, 3, 1, 2, 2])
    >>> y_pred_set = np.array([
    ...    [True, True, True, True],
    ...    [False, True, False, True],
    ...    [True, True, True, False],
    ...    [False, False, True, True],
    ...    [True, True, False, True]])
    >>> print(classification_ssc(y_true, y_pred_set, num_bins=2))
    [[1.         0.66666667]]
    """
    y_true = cast(NDArray, column_or_1d(y_true))
    # Normalizes y_pred_set to shape (n_samples, n_class, n_confidence_level).
    y_pred_set = _check_array_shape_classification(y_true, y_pred_set)

    _check_arrays_length(y_true, y_pred_set)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_pred_set)
    _check_array_inf(y_pred_set)

    # Set size per sample and confidence level: number of labels in each set.
    sizes = np.sum(y_pred_set, axis=1)
    n_classes = y_pred_set.shape[1]
    if num_bins is None:
        # One bin per possible set size, from 0 to n_classes.
        bins = list(range(n_classes + 1))
    else:
        _check_nb_sets_sizes(sizes, num_bins)
        _check_number_bins(num_bins)
        # Lower edge of each of the num_bins roughly-equal size ranges.
        bins = [b[0] for b in np.array_split(range(n_classes + 1), num_bins)]

    # Bin index (1-based, as returned by np.digitize) of each set size.
    digitized_sizes: NDArray = np.digitize(sizes, bins)
    coverages = np.zeros((y_pred_set.shape[2], len(bins)))
    for alpha in range(y_pred_set.shape[2]):
        # Sample indices falling into each size bin, for this confidence level.
        indexes_bybins = [
            np.argwhere(digitized_sizes[:, alpha] == i) for i in range(1, len(bins) + 1)
        ]

        # Conditional coverage restricted to the samples of each bin.
        for i, indexes in enumerate(indexes_bybins):
            coverages[alpha, i] = classification_coverage_score(
                y_true[indexes],
                np.take_along_axis(y_pred_set[:, :, alpha], indexes, axis=0),
            ).item()
    return coverages

mapie.metrics.classification.classification_ssc_score

classification_ssc_score(
    y_true: NDArray,
    y_pred_set: NDArray,
    num_bins: Union[int, None] = None,
) -> NDArray

Aggregate the Size-Stratified Coverage [3] by taking, for each confidence level, the minimum coverage over the groups: this returns the largest violation of the conditional coverage (with the groups defined).

PARAMETER DESCRIPTION
y_true

True labels.

TYPE: NDArray

y_pred_set

Prediction sets given by booleans of labels.

TYPE: NDArray

num_bins

Number of groups. If None, one value of coverage by possible size of sets (n_classes +1) is computed. Should be less than the number of different set sizes.

TYPE: Union[int, None] DEFAULT: None

RETURNS DESCRIPTION
NDArray of shape (n_confidence_level,)

Examples:

>>> from mapie.metrics.classification import classification_ssc_score
>>> import numpy as np
>>> y_true = y_true_class = np.array([3, 3, 1, 2, 2])
>>> y_pred_set = np.array([
...    [True, True, True, True],
...    [False, True, False, True],
...    [True, True, True, False],
...    [False, False, True, True],
...    [True, True, False, True]])
>>> print(classification_ssc_score(y_true, y_pred_set, num_bins=2))
[0.66666667]
Source code in mapie/metrics/classification.py
def classification_ssc_score(
    y_true: NDArray, y_pred_set: NDArray, num_bins: Union[int, None] = None
) -> NDArray:
    """
    Aggregate the Size-Stratified Coverage [3] by taking, for each confidence
    level, the minimum coverage over the size bins: this is the largest
    violation of the conditional coverage (with the groups defined).

    Parameters
    ----------
    y_true: NDArray of shape (n_samples,)
        True labels.
    y_pred_set: NDArray of shape (n_samples, n_class, n_confidence_level) or (n_samples, n_class)
        Prediction sets given by booleans of labels.
    num_bins: int or None
        Number of groups. If None, one value of coverage by possible
        size of sets (n_classes +1) is computed. Should be less than
        the number of different set sizes.

    Returns
    -------
    NDArray of shape (n_confidence_level,)

    Examples
    --------
    >>> from mapie.metrics.classification import classification_ssc_score
    >>> import numpy as np
    >>> y_true = y_true_class = np.array([3, 3, 1, 2, 2])
    >>> y_pred_set = np.array([
    ...    [True, True, True, True],
    ...    [False, True, False, True],
    ...    [True, True, True, False],
    ...    [False, False, True, True],
    ...    [True, True, False, True]])
    >>> print(classification_ssc_score(y_true, y_pred_set, num_bins=2))
    [0.66666667]
    """
    _check_arrays_length(y_true, y_pred_set)
    for arr in (y_true, y_pred_set):
        _check_array_nan(arr)
        _check_array_inf(arr)

    # Worst (lowest) coverage across bins = largest conditional-coverage violation.
    ssc = classification_ssc(y_true, y_pred_set, num_bins)
    return cast(NDArray, np.nanmin(ssc, axis=1))

Calibration

mapie.metrics.calibration.expected_calibration_error

expected_calibration_error(
    y_true: ArrayLike,
    y_scores: ArrayLike,
    num_bins: int = 50,
    split_strategy: Optional[str] = None,
) -> float

The expected calibration error, which is the difference between the confidence scores and accuracy per bin [1].

[1] Naeini, Mahdi Pakdaman, Gregory Cooper, and Milos Hauskrecht. "Obtaining well calibrated probabilities using bayesian binning." Twenty-Ninth AAAI Conference on Artificial Intelligence. 2015.

PARAMETER DESCRIPTION
y_true

The target values for the calibrator.

TYPE: ArrayLike

y_scores

The predictions scores.

TYPE: ArrayLike

num_bins

Number of bins to make the split in the y_score. The allowed values are num_bins above 0.

TYPE: int DEFAULT: 50

split_strategy

The way of splitting the predictions into different bins. The allowed split strategies are "uniform", "quantile" and "array split".

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
float

The score of ECE (Expected Calibration Error).

Source code in mapie/metrics/calibration.py
def expected_calibration_error(
    y_true: ArrayLike,
    y_scores: ArrayLike,
    num_bins: int = 50,
    split_strategy: Optional[str] = None,
) -> float:
    """
    The expected calibration error, which is the difference between
    the confidence scores and accuracy per bin [1].

    [1] Naeini, Mahdi Pakdaman, Gregory Cooper, and Milos Hauskrecht.
    "Obtaining well calibrated probabilities using bayesian binning."
    Twenty-Ninth AAAI Conference on Artificial Intelligence. 2015.

    Parameters
    ----------
    y_true: ArrayLike of shape (n_samples,)
        The target values for the calibrator.
    y_scores: ArrayLike of shape (n_samples,) or (n_samples, n_classes)
        The predictions scores.
    num_bins: int
        Number of bins to make the split in the y_score. The allowed
        values are num_bins above 0.
    split_strategy: str
        The way of splitting the predictions into different bins.
        The allowed split strategies are "uniform", "quantile" and
        "array split".

    Returns
    -------
    float
        The score of ECE (Expected Calibration Error).
    """
    split_strategy = _check_split_strategy(split_strategy)
    num_bins = _check_number_bins(num_bins)
    y_true_ = _check_binary_zero_one(y_true)
    y_scores = cast(NDArray, y_scores)

    _check_arrays_length(y_true_, y_scores)
    _check_array_nan(y_true_)
    _check_array_inf(y_true_)
    _check_array_nan(y_scores)
    _check_array_inf(y_scores)

    # For a 2D score matrix, calibration is computed on the top (maximum)
    # score of each sample. np.ndim is the direct dimensionality check;
    # the previous ``np.size(y_scores.shape) == 2`` obscurely relied on
    # the length of the shape tuple.
    if np.ndim(y_scores) == 2:
        y_score = cast(NDArray, column_or_1d(np.nanmax(y_scores, axis=1)))
    else:
        y_score = cast(NDArray, column_or_1d(y_scores))

    _, bin_accs, bin_confs, bin_sizes = _calc_bins(
        y_true_, y_score, num_bins, split_strategy
    )

    # ECE = size-weighted average of |accuracy - confidence| over bins.
    return float(
        np.divide(np.sum(bin_sizes * np.abs(bin_accs - bin_confs)), np.sum(bin_sizes))
    )

mapie.metrics.calibration.top_label_ece

top_label_ece(
    y_true: ArrayLike,
    y_scores: ArrayLike,
    y_score_arg: Optional[ArrayLike] = None,
    num_bins: int = 50,
    split_strategy: Optional[str] = None,
    classes: Optional[ArrayLike] = None,
) -> float

The Top-Label ECE which is a method adapted to fit the ECE to a Top-Label setting [2].

[2] Gupta, Chirag, and Aaditya K. Ramdas. "Top-label calibration and multiclass-to-binary reductions." arXiv preprint arXiv:2107.08353 (2021).

PARAMETER DESCRIPTION
y_true

The target values for the calibrator.

TYPE: ArrayLike

y_scores

The predictions scores, either the maximum score and the argmax needs to be inputted or in the form of the prediction probabilities.

TYPE: ArrayLike

y_score_arg

If only the maximum is provided in the y_scores, the argmax must be provided here. This is optional and could be directly infered from the y_scores.

TYPE: Optional[ArrayLike] DEFAULT: None

num_bins

Number of bins to make the split in the y_score. The allowed values are num_bins above 0.

TYPE: int DEFAULT: 50

split_strategy

The way of splitting the predictions into different bins. The allowed split strategies are "uniform", "quantile" and "array split".

TYPE: Optional[str] DEFAULT: None

classes

The different classes, in order of the indices that would be present in a pred_proba.

TYPE: Optional[ArrayLike] DEFAULT: None

RETURNS DESCRIPTION
float

The ECE score adapted in the top label setting.

Source code in mapie/metrics/calibration.py
def top_label_ece(
    y_true: ArrayLike,
    y_scores: ArrayLike,
    y_score_arg: Optional[ArrayLike] = None,
    num_bins: int = 50,
    split_strategy: Optional[str] = None,
    classes: Optional[ArrayLike] = None,
) -> float:
    """
    The Top-Label ECE which is a method adapted to fit the
    ECE to a Top-Label setting [2].

    [2] Gupta, Chirag, and Aaditya K. Ramdas.
    "Top-label calibration and multiclass-to-binary reductions."
    arXiv preprint arXiv:2107.08353 (2021).

    Parameters
    ----------
    y_true: ArrayLike of shape (n_samples,)
        The target values for the calibrator.
    y_scores: ArrayLike of shape (n_samples, n_classes) or (n_samples,)
        The predictions scores, either the maximum score and the
        argmax needs to be inputted or in the form of the prediction
        probabilities.
    y_score_arg: Optional[ArrayLike] of shape (n_samples,)
        If only the maximum is provided in the y_scores, the argmax must
        be provided here. This is optional and could be directly infered
        from the y_scores.
    num_bins: int
        Number of bins to make the split in the y_score. The allowed
        values are num_bins above 0.
    split_strategy: str
        The way of splitting the predictions into different bins.
        The allowed split strategies are "uniform", "quantile" and
        "array split".
    classes: ArrayLike of shape (n_samples,)
        The different classes, in order of the indices that would be
        present in a pred_proba.

    Returns
    -------
    float
        The ECE score adapted in the top label setting.
    """
    y_scores = cast(NDArray, y_scores)
    y_true = cast(NDArray, y_true)
    _check_array_nan(y_true)
    _check_array_inf(y_true)
    _check_array_nan(y_scores)
    _check_array_inf(y_scores)

    if y_score_arg is None:
        _check_arrays_length(y_true, y_scores)
    else:
        y_score_arg = cast(NDArray, y_score_arg)
        _check_array_nan(y_score_arg)
        _check_array_inf(y_score_arg)
        _check_arrays_length(y_true, y_scores, y_score_arg)

    split_strategy = _check_split_strategy(split_strategy)
    num_bins = _check_number_bins(num_bins)
    y_true = cast(NDArray, column_or_1d(y_true))

    if y_score_arg is None:
        # Derive the top score (and its label) from the probability matrix.
        y_score = cast(NDArray, column_or_1d(np.nanmax(y_scores, axis=1)))
        top_indices = np.nanargmax(y_scores, axis=1)
        if classes is None:
            y_score_arg = cast(NDArray, column_or_1d(top_indices))
        else:
            classes = cast(NDArray, classes)
            y_score_arg = cast(NDArray, column_or_1d(classes[top_indices]))
    else:
        # Top score and label were provided directly by the caller.
        y_score = cast(NDArray, column_or_1d(y_scores))
        y_score_arg = cast(NDArray, column_or_1d(y_score_arg))

    labels = np.unique(y_score_arg)

    # One binary ECE per predicted label, then average over labels.
    per_label_eces = []
    for label in labels:
        label_ind = np.where(label == y_score_arg)[0]
        binary_truth = np.array(y_true[label_ind] == label, dtype=int)
        per_label_eces.append(
            expected_calibration_error(
                binary_truth,
                y_scores=y_score[label_ind],
                num_bins=num_bins,
                split_strategy=split_strategy,
            )
        )
    return sum(per_label_eces) / len(labels)

mapie.metrics.calibration.kolmogorov_smirnov_statistic

kolmogorov_smirnov_statistic(
    y_true: NDArray, y_score: NDArray
) -> float

Compute Kolmogorov-smirnov's statistic for calibration test. Also called ECCE-MAD (Estimated Cumulative Calibration Errors - Maximum Absolute Deviation). The closer to zero, the better the scores are calibrated. Indeed, if the scores are perfectly calibrated, the cumulative differences between y_true and y_score should share the same properties of a standard Brownian motion asymptotically.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

Kolmogorov-smirnov's statistic.

References

Arrieta-Ibarra I, Gujral P, Tannen J, Tygert M, Xu C. Metrics of calibration for probabilistic predictions. The Journal of Machine Learning Research. 2022 Jan 1;23(1):15886-940.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import kolmogorov_smirnov_statistic
>>> y_true = np.array([0, 1, 0, 1, 0])
>>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
>>> print(np.round(kolmogorov_smirnov_statistic(y_true, y_score), 3))
0.978
Source code in mapie/metrics/calibration.py
def kolmogorov_smirnov_statistic(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute Kolmogorov-smirnov's statistic for calibration test,
    also known as ECCE-MAD
    (Estimated Cumulative Calibration Errors - Maximum Absolute Deviation).
    The closer to zero, the better the scores are calibrated:
    under perfect calibration, the cumulative differences between
    `y_true` and `y_score` asymptotically share the properties of a
    standard Brownian motion.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        Kolmogorov-smirnov's statistic.

    References
    ----------
    Arrieta-Ibarra I, Gujral P, Tannen J, Tygert M, Xu C.
    Metrics of calibration for probabilistic predictions.
    The Journal of Machine Learning Research.
    2022 Jan 1;23(1):15886-940.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kolmogorov_smirnov_statistic
    >>> y_true = np.array([0, 1, 0, 1, 0])
    >>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
    >>> print(np.round(kolmogorov_smirnov_statistic(y_true, y_score), 3))
    0.978
    """
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)

    # Largest absolute cumulative deviation, normalised by the Brownian
    # length scale of the scores.
    deviations = np.abs(cumulative_differences(y_true, y_score))
    return float(np.max(deviations) / length_scale(y_score))

mapie.metrics.calibration.kolmogorov_smirnov_cdf

kolmogorov_smirnov_cdf(x: float) -> float

Compute the Kolmogorov-smirnov cumulative distribution function (CDF) for the float x. This is interpreted as the CDF of the maximum absolute value of the standard Brownian motion over the unit interval [0, 1]. The function is approximated by its power series, truncated so as to hit machine precision error.

PARAMETER DESCRIPTION
x

The float x to compute the cumulative distribution function on.

TYPE: float

RETURNS DESCRIPTION
float

The Kolmogorov-smirnov cumulative distribution function.

References

Tygert M. Calibration of P-values for calibration and for deviation of a subpopulation from the full population. arXiv preprint arXiv:2202.00100. 2022 Jan 31.

D. A. Darling. A. J. F. Siegert. The First Passage Problem for a Continuous Markov Process. Ann. Math. Statist. 24 (4) 624 - 639, December, 1953.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import kolmogorov_smirnov_cdf
>>> print(np.round(kolmogorov_smirnov_cdf(1), 4))
0.3708
Source code in mapie/metrics/calibration.py
def kolmogorov_smirnov_cdf(x: float) -> float:
    """
    Compute the Kolmogorov-smirnov cumulative distribution
    function (CDF) for the float x.
    This is interpreted as the CDF of the maximum absolute value
    of the standard Brownian motion over the unit interval [0, 1].
    The function is approximated by its power series, truncated so as to hit
    machine precision error.

    Parameters
    ----------
    x : float
        The float x to compute the cumulative distribution function on.

    Returns
    -------
    float
        The Kolmogorov-smirnov cumulative distribution function.

    References
    ----------
    Tygert M.
    Calibration of P-values for calibration and for deviation
    of a subpopulation from the full population.
    arXiv preprint arXiv:2202.00100.
    2022 Jan 31.

    D. A. Darling. A. J. F. Siegert.
    The First Passage Problem for a Continuous Markov Process.
    Ann. Math. Statist. 24 (4) 624 - 639, December,
    1953.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kolmogorov_smirnov_cdf
    >>> print(np.round(kolmogorov_smirnov_cdf(1), 4))
    0.3708
    """
    # Number of series terms required before the remainder drops below
    # machine precision (EPSILON).
    n_terms = int(
        np.ceil(0.5 + x * np.sqrt(2) / np.pi * np.sqrt(np.log(4 / (np.pi * EPSILON))))
    )
    # Alternating power series in the half-integers k + 1/2.
    series = sum(
        (-1) ** k / (k + 0.5) * np.exp(-((k + 0.5) ** 2) * np.pi**2 / (2 * x**2))
        for k in range(n_terms)
    )
    return series * (2 / np.pi)

mapie.metrics.calibration.kolmogorov_smirnov_p_value

kolmogorov_smirnov_p_value(
    y_true: NDArray, y_score: NDArray
) -> float

Compute Kolmogorov Smirnov p-value. Deduced from the corresponding statistic and CDF. It represents the probability of the observed statistic under the null hypothesis of perfect calibration.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

The Kolmogorov Smirnov p-value.

References

Tygert M. Calibration of P-values for calibration and for deviation of a subpopulation from the full population. arXiv preprint arXiv:2202.00100. 2022 Jan 31.

D. A. Darling. A. J. F. Siegert. The First Passage Problem for a Continuous Markov Process. Ann. Math. Statist. 24 (4) 624 - 639, December, 1953.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from mapie.metrics.calibration import kolmogorov_smirnov_p_value
>>> y_true = np.array([1, 0, 1, 0, 1, 0])
>>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
>>> ks_p_value = kolmogorov_smirnov_p_value(y_true, y_score)
>>> print(np.round(ks_p_value, 4))
0.7857
Source code in mapie/metrics/calibration.py
def kolmogorov_smirnov_p_value(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute Kolmogorov Smirnov p-value,
    deduced from the corresponding statistic and CDF.
    It represents the probability of the observed statistic
    under the null hypothesis of perfect calibration.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        The Kolmogorov Smirnov p-value.

    References
    ----------
    Tygert M.
    Calibration of P-values for calibration and for deviation
    of a subpopulation from the full population.
    arXiv preprint arXiv:2202.00100.
    2022 Jan 31.

    D. A. Darling. A. J. F. Siegert.
    The First Passage Problem for a Continuous Markov Process.
    Ann. Math. Statist. 24 (4) 624 - 639, December,
    1953.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kolmogorov_smirnov_p_value
    >>> y_true = np.array([1, 0, 1, 0, 1, 0])
    >>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
    >>> ks_p_value = kolmogorov_smirnov_p_value(y_true, y_score)
    >>> print(np.round(ks_p_value, 4))
    0.7857
    """
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    # Survival value (1 - CDF) of the observed statistic under the null.
    statistic = kolmogorov_smirnov_statistic(y_true, y_score)
    return 1 - kolmogorov_smirnov_cdf(statistic)

mapie.metrics.calibration.kuiper_statistic

kuiper_statistic(
    y_true: NDArray, y_score: NDArray
) -> float

Compute Kuiper's statistic for calibration test. Also called ECCE-R (Estimated Cumulative Calibration Errors - Range). The closer to zero, the better the scores are calibrated. Indeed, if the scores are perfectly calibrated, the cumulative differences between y_true and y_score should share the same properties of a standard Brownian motion asymptotically.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

Kuiper's statistic.

References

Arrieta-Ibarra I, Gujral P, Tannen J, Tygert M, Xu C. Metrics of calibration for probabilistic predictions. The Journal of Machine Learning Research. 2022 Jan 1;23(1):15886-940.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import kuiper_statistic
>>> y_true = np.array([0, 1, 0, 1, 0])
>>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
>>> print(np.round(kuiper_statistic(y_true, y_score), 3))
0.857
Source code in mapie/metrics/calibration.py
def kuiper_statistic(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute Kuiper's statistic for calibration test,
    also known as ECCE-R (Estimated Cumulative Calibration Errors - Range).
    The closer to zero, the better the scores are calibrated:
    under perfect calibration, the cumulative differences between
    `y_true` and `y_score` asymptotically share the properties of a
    standard Brownian motion.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        Kuiper's statistic.

    References
    ----------
    Arrieta-Ibarra I, Gujral P, Tannen J, Tygert M, Xu C.
    Metrics of calibration for probabilistic predictions.
    The Journal of Machine Learning Research.
    2022 Jan 1;23(1):15886-940.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kuiper_statistic
    >>> y_true = np.array([0, 1, 0, 1, 0])
    >>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
    >>> print(np.round(kuiper_statistic(y_true, y_score), 3))
    0.857
    """
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    cum_diff = cumulative_differences(y_true, y_score)
    # Range (max - min) of the cumulative deviations, normalised by the
    # Brownian length scale of the scores.
    return float(np.ptp(cum_diff) / length_scale(y_score))  # type: ignore

mapie.metrics.calibration.kuiper_cdf

kuiper_cdf(x: float) -> float

Compute the Kuiper cumulative distribution function (CDF) for the float x. This is interpreted as the CDF of the range of the standard Brownian motion over the unit interval [0, 1]. The function is approximated by its power series, truncated so as to hit machine precision error.

PARAMETER DESCRIPTION
x

The float x to compute the cumulative distribution function.

TYPE: float

RETURNS DESCRIPTION
float

The Kuiper cumulative distribution function.

References

Tygert M. Calibration of P-values for calibration and for deviation of a subpopulation from the full population. arXiv preprint arXiv:2202.00100. 2022 Jan 31.

William Feller. The Asymptotic Distribution of the Range of Sums of Independent Random Variables. Ann. Math. Statist. 22 (3) 427 - 432 September, 1951.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import kuiper_cdf
>>> print(np.round(kuiper_cdf(1), 4))
0.0634
Source code in mapie/metrics/calibration.py
def kuiper_cdf(x: float) -> float:
    """
    Evaluate the Kuiper cumulative distribution function (CDF) at ``x``.

    This is the CDF of the range of a standard Brownian motion on the
    unit interval [0, 1], approximated by its power series. The series
    is truncated at the first index beyond which remaining terms fall
    below machine precision (``EPSILON``).

    Parameters
    ----------
    x : float
        Point at which the CDF is evaluated.

    Returns
    -------
    float
        Value of the Kuiper CDF at ``x``.

    References
    ----------
    Tygert M.
    Calibration of P-values for calibration and for deviation
    of a subpopulation from the full population.
    arXiv preprint arXiv:2202.00100.
    2022 Jan 31.

    William Feller.
    The Asymptotic Distribution of the Range of Sums of
    Independent Random Variables.
    Ann. Math. Statist. 22 (3) 427 - 432
    September, 1951.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kuiper_cdf
    >>> print(np.round(kuiper_cdf(1), 4))
    0.0634
    """
    # Number of series terms needed so the truncation error is below
    # machine precision (see Tygert 2022 for the derivation).
    log_arg = np.log(
        4 / (np.sqrt(2 * np.pi) * EPSILON) * (1 / x + x / np.pi**2)
    )
    n_terms = int(np.ceil(0.5 + x / (np.pi * np.sqrt(2)) * np.sqrt(log_arg)))

    pi_sq = np.pi**2
    total = 0.0
    for j in range(n_terms):
        half = j + 1 / 2  # half-integer index of the series term
        total += (8 / x**2 + 2 / half**2 / pi_sq) * np.exp(
            -2 * half**2 * pi_sq / x**2
        )
    return total

mapie.metrics.calibration.kuiper_p_value

kuiper_p_value(y_true: NDArray, y_score: NDArray) -> float

Compute Kuiper statistic p-value. Deduced from the corresponding statistic and CDF. It represents the probability of the observed statistic under the null hypothesis of perfect calibration.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

The Kuiper p-value.

References

Tygert M. Calibration of P-values for calibration and for deviation of a subpopulation from the full population. arXiv preprint arXiv:2202.00100. 2022 Jan 31.

William Feller. The Asymptotic Distribution of the Range of Sums of Independent Random Variables. Ann. Math. Statist. 22 (3) 427 - 432 September, 1951.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from mapie.metrics.calibration import kuiper_p_value
>>> y_true = np.array([1, 0, 1, 0, 1, 0])
>>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
>>> ku_p_value = kuiper_p_value(y_true, y_score)
>>> print(np.round(ku_p_value, 4))
0.9684
Source code in mapie/metrics/calibration.py
def kuiper_p_value(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute the p-value of the Kuiper calibration statistic.

    The p-value is one minus the Kuiper CDF evaluated at the observed
    statistic: the probability of observing a statistic at least this
    large under the null hypothesis of perfect calibration.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        The Kuiper p-value.

    References
    ----------
    Tygert M.
    Calibration of P-values for calibration and for deviation
    of a subpopulation from the full population.
    arXiv preprint arXiv:2202.00100.
    2022 Jan 31.

    William Feller.
    The Asymptotic Distribution of the Range of Sums of
    Independent Random Variables.
    Ann. Math. Statist. 22 (3) 427 - 432
    September, 1951.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from mapie.metrics.calibration import kuiper_p_value
    >>> y_true = np.array([1, 0, 1, 0, 1, 0])
    >>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
    >>> ku_p_value = kuiper_p_value(y_true, y_score)
    >>> print(np.round(ku_p_value, 4))
    0.9684
    """
    # Validate inputs: matching lengths, no NaN, no infinities.
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    statistic = kuiper_statistic(y_true, y_score)
    return 1 - kuiper_cdf(statistic)

mapie.metrics.calibration.spiegelhalter_statistic

spiegelhalter_statistic(
    y_true: NDArray, y_score: NDArray
) -> float

Compute Spiegelhalter's statistic for calibration test. The closer to zero, the better the scores are calibrated. Indeed, if the scores are perfectly calibrated, the Brier score simplifies to an expression whose expectancy and variance are easy to compute. The statistic is no more than a z-score on this normalized expression.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

Spiegelhalter's statistic.

References

Spiegelhalter DJ. Probabilistic prediction in patient management and clinical trials. Statistics in medicine. 1986 Sep;5(5):421-33.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import spiegelhalter_statistic
>>> y_true = np.array([0, 1, 0, 1, 0])
>>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
>>> print(np.round(spiegelhalter_statistic(y_true, y_score), 3))
-0.757
Source code in mapie/metrics/calibration.py
def spiegelhalter_statistic(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute Spiegelhalter's z-statistic for a calibration test.

    Values close to zero indicate well-calibrated scores. Under perfect
    calibration, the Brier score reduces to an expression with known
    mean and variance; this statistic is the corresponding z-score of
    that normalized expression.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        Spiegelhalter's statistic.

    References
    ----------
    Spiegelhalter DJ.
    Probabilistic prediction in patient management and clinical trials.
    Statistics in medicine.
    1986 Sep;5(5):421-33.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import spiegelhalter_statistic
    >>> y_true = np.array([0, 1, 0, 1, 0])
    >>> y_score = np.array([0.1, 0.9, 0.21, 0.9, 0.5])
    >>> print(np.round(spiegelhalter_statistic(y_true, y_score), 3))
    -0.757
    """
    # Validate inputs: matching lengths, no NaN, no infinities.
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)

    # z = sum((y - p)(1 - 2p)) / sqrt(sum((1 - 2p)^2 p (1 - p)))
    one_minus_two_p = 1 - 2 * y_score
    num: float = np.sum((y_true - y_score) * one_minus_two_p)
    var = np.sum(one_minus_two_p**2 * y_score * (1 - y_score))
    return float(num / np.sqrt(var))

mapie.metrics.calibration.spiegelhalter_p_value

spiegelhalter_p_value(
    y_true: NDArray, y_score: NDArray
) -> float

Compute Spiegelhalter statistic p-value. Deduced from the corresponding statistic and CDF, which is no more than the normal distribution. It represents the probability of the observed statistic under the null hypothesis of perfect calibration.

PARAMETER DESCRIPTION
y_true

An array of ground truth.

TYPE: NDArray of shape (n_samples,)

y_score

An array of scores.

TYPE: NDArray of shape (n_samples,)

RETURNS DESCRIPTION
float

The Spiegelhalter statistic p_value.

References

Spiegelhalter DJ. Probabilistic prediction in patient management and clinical trials. Statistics in medicine. 1986 Sep;5(5):421-33.

Examples:

>>> import numpy as np
>>> from mapie.metrics.calibration import spiegelhalter_p_value
>>> y_true = np.array([1, 0, 1, 0, 1, 0])
>>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
>>> sp_p_value = spiegelhalter_p_value(y_true, y_score)
>>> print(np.round(sp_p_value, 4))
0.8486
Source code in mapie/metrics/calibration.py
def spiegelhalter_p_value(y_true: NDArray, y_score: NDArray) -> float:
    """
    Compute the p-value of Spiegelhalter's calibration statistic.

    The statistic is asymptotically standard normal, so the p-value is
    one minus the normal CDF at the observed value: the probability of
    the observed statistic under the null hypothesis of perfect
    calibration.

    Parameters
    ----------
    y_true : NDArray of shape (n_samples,)
        An array of ground truth.

    y_score : NDArray of shape (n_samples,)
        An array of scores.

    Returns
    -------
    float
        The Spiegelhalter statistic p_value.

    References
    ----------
    Spiegelhalter DJ.
    Probabilistic prediction in patient management and clinical trials.
    Statistics in medicine.
    1986 Sep;5(5):421-33.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.metrics.calibration import spiegelhalter_p_value
    >>> y_true = np.array([1, 0, 1, 0, 1, 0])
    >>> y_score = np.array([0.8, 0.3, 0.5, 0.5, 0.7, 0.1])
    >>> sp_p_value = spiegelhalter_p_value(y_true, y_score)
    >>> print(np.round(sp_p_value, 4))
    0.8486
    """
    # Validate inputs: matching lengths, no NaN, no infinities.
    _check_arrays_length(y_true, y_score)
    for arr in (y_true, y_score):
        _check_array_nan(arr)
        _check_array_inf(arr)

    z = spiegelhalter_statistic(y_true, y_score)
    return float(1 - scipy.stats.norm.cdf(z))