Source code for hcrystalball.metrics._scorer

from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import SCORERS
from sklearn.metrics._scorer import _BaseScorer

from hcrystalball.utils import generate_estimator_hash
from hcrystalball.utils import get_estimator_repr


class PersistCVDataMixin:
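    """Mixin persisting per-split predictions and estimator identifiers during cross validation."""
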
    def _save_prediction(self, y_pred, estimator_label, y_true):
        """Persist the prediction in cross validation

        Parameters
        ----------
        y_pred: `pandas.DataFrame`
            Predictions. A DataFrame container with a single column and datetime index

        estimator_label: str
            Label of the estimator used to identify the model with a given parameter set in the persisted
            data

        y_true: `pandas.DataFrame`
            True values. A DataFrame container with a single column and the same datetime index as
            y_pred. If not provided, the 'y_true' column is omitted from the persisted data without
            raising any warning or exception.

        Returns
        -------
        None
        """
        # Check if the predicted indices exist already in the dataframe
        if not y_pred.index.isin(self._cv_data.index).all():
            # We're in a new split
            new_split_df = pd.DataFrame({"y_true": y_true}, index=y_pred.index).assign(
                split=self._split_index[estimator_label]
            )
            self._cv_data = pd.concat([self._cv_data, new_split_df], sort=False)

        # Add the new predictions to the cv data container
        self._cv_data.loc[
            lambda x: x["split"] == self._split_index[estimator_label], estimator_label
        ] = y_pred.values[:, 0]
        self._split_index[estimator_label] += 1

    def _upsert_estimator_hash(self, estimator_repr, estimator_hash):
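        """Register the estimator's textual representation under its hash if not seen before."""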
        if estimator_hash not in self._estimator_ids:
            self._estimator_ids[estimator_hash] = estimator_repr

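# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library source): how the mixin lays out
# ``_cv_data``. The attribute setup mirrors ``_TSPredictScorer.__init__`` and
# the label "model_a" stands in for the estimator hash the scorer would pass.
def _example_cv_data_layout():
    """Build a tiny cv-data frame by hand to show the persisted structure."""

    class _Demo(PersistCVDataMixin):
        def __init__(self):
            self._cv_data = pd.DataFrame(columns=["split"])
            self._estimator_ids = dict()
            self._split_index = defaultdict(int)

    demo = _Demo()
    index = pd.date_range("2020-01-01", periods=3, freq="D")
    y_true = pd.Series([1.0, 2.0, 3.0], index=index)
    y_pred = pd.DataFrame({"prediction": [1.1, 1.9, 3.2]}, index=index)

    demo._save_prediction(y_pred=y_pred, estimator_label="model_a", y_true=y_true)
    # ``_cv_data`` now holds one row per timestamp with a ``split`` column
    # (0 for the first split), the ``y_true`` values and one prediction column
    # per estimator label (here ``model_a``).
    return demo._cv_data
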

class _TSPredictScorer(_BaseScorer, PersistCVDataMixin):
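    """Scorer that, besides computing the score, persists cross-validation predictions per estimator."""
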
    def __call__(self, estimator, X, y_true, sample_weight=None):
        """Evaluate predicted target values for X relative to y_true.

        Parameters
        ----------
        estimator : object
            Trained estimator to use for scoring. Must have a ``predict``
            method; the output of that is used to compute the score.

        X : array-like or sparse matrix
            Test data that will be fed to estimator.predict.

        y_true : array-like
            Gold standard target values for X.

        sample_weight : array-like
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of estimator on X.
        """
        return self._score(None, estimator, X, y_true, sample_weight)

    def _score(self, method_caller, estimator, X, y_true, sample_weight=None):
        """Evaluate predicted target values for X relative to y_true.

        Parameters
        ----------
        method_caller : callable
            Returns predictions given an estimator, method name, and other
            arguments, potentially caching results.

        estimator : object
            Trained estimator to use for scoring. Must have a ``predict``
            method; the output of that is used to compute the score.

        X : array-like or sparse matrix
            Test data that will be fed to estimator.predict.

        y_true : array-like
            Gold standard target values for X.

        sample_weight : array-like
            Sample weights.

        Returns
        -------
        score : float
            Score function applied to prediction of estimator on X.
        """

        y_pred = estimator.predict(X)

        estimator_repr = get_estimator_repr(estimator)
        estimator_hash = generate_estimator_hash(estimator)
        self._upsert_estimator_hash(estimator_repr, estimator_hash)
        self._save_prediction(y_pred=y_pred, estimator_label=estimator_hash, y_true=y_true)

        if y_pred.isna().any().any() or np.isinf(y_pred).any().any():
            return np.nan
        else:
            if sample_weight is not None:
                return self._sign * self._score_func(
                    y_true, y_pred, sample_weight=sample_weight, **self._kwargs
                )
            else:
                return self._sign * self._score_func(y_true, y_pred, **self._kwargs)

    def __init__(self, score_func, sign, kwargs):
        """Enhances inherited init with cv data, split index and estimator ids

        Parameters
        ----------
        score_func : callable
            Scoring function
        sign : int
            Whether to maximize (1) or minimize (-1) the scoring function
        kwargs : dict
            Additional arguments passed to the inherited init and ultimately to ``score_func``
        """
        super().__init__(score_func, sign, kwargs)

        self._cv_data = pd.DataFrame(columns=["split"])
        self._estimator_ids = dict()
        self._split_index = defaultdict(int)

    @property
    def estimator_ids(self):
        return self._estimator_ids

    @property
    def cv_data(self):
        if self._cv_data.shape[0] > 0:
            return self._cv_data
        else:
            return None


def get_scorer(function="neg_mean_absolute_error"):
    """Get a scorer that persists cross-validation data, from a metric name or an existing scorer.

    Parameters
    ----------
    function : str or callable
        Name of an sklearn scorer or a scorer created with ``make_ts_scorer``

    Returns
    -------
    sklearn compatible scorer
        Scorer with data and estimator ids storage
    """
    if isinstance(function, str):
        scorer = SCORERS[function]
        greater_is_better = scorer._sign == 1
        return make_ts_scorer(scorer._score_func, greater_is_better)
    elif hasattr(function, "_cv_data") and hasattr(function, "_estimator_ids"):
        return function
    else:
        raise ValueError(
            "Provided scoring function must be an instance of `_TSPredictScorer` "
            f"(use make_ts_scorer) or one of {SCORERS.keys()}"
        )


def make_ts_scorer(
    score_func,
    greater_is_better=True,
    needs_proba=False,
    needs_threshold=False,
    **kwargs,
):
    """Make a scorer from a performance metric or loss function.

    This factory function wraps scoring functions for use in
    `~sklearn.model_selection.GridSearchCV` and
    `~sklearn.model_selection.cross_validate`. It takes a score function, such
    as ``accuracy_score``, ``mean_squared_error``, ``adjusted_rand_index`` or
    ``average_precision`` and returns a callable that scores an estimator's
    output. Read more in the :ref:`User Guide <scoring>`.

    Parameters
    ----------
    score_func : callable
        Score function (or loss function) with signature
        ``score_func(y, y_pred, **kwargs)``.

    greater_is_better : boolean
        Whether score_func is a score function (default), meaning high is good,
        or a loss function, meaning low is good. In the latter case, the scorer
        object will sign-flip the outcome of the score_func.

    needs_proba : boolean
        Not yet implemented, kept only to be compatible with the scikit-learn API

    needs_threshold : boolean
        Not yet implemented, kept only to be compatible with the scikit-learn API

    **kwargs : additional arguments
        Additional parameters to be passed to score_func.

    Returns
    -------
    callable
        Scorer object that returns a scalar score
    """
    sign = 1 if greater_is_better else -1
    if needs_proba and needs_threshold:
        raise ValueError("Set either needs_proba or needs_threshold to True, but not both.")
    if needs_proba:
        raise NotImplementedError("Usage/evaluation of prediction probabilities is not yet implemented.")
    elif needs_threshold:
        raise NotImplementedError("Evaluation of decision function output is not yet implemented.")
    else:
        cls = _TSPredictScorer
    return cls(score_func, sign, kwargs)
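
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library source): turning a plain
# sklearn metric into a cross-validation-aware scorer. After a grid search run
# the persisted per-split predictions are exposed via ``scorer.cv_data`` and
# the hash-to-representation mapping via ``scorer.estimator_ids``.
if __name__ == "__main__":
    from sklearn.metrics import mean_absolute_error

    # Build the scorer from a name registered in sklearn's SCORERS ...
    scorer = get_scorer("neg_mean_absolute_error")

    # ... or directly from a metric; for a loss, greater_is_better=False makes
    # the scorer sign-flip the result so that higher is always better.
    scorer = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    # Nothing has been persisted before the first cross-validation call.
    assert scorer.cv_data is None
    assert scorer.estimator_ids == {}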