Source code for hcrystalball.model_selection._configuration

import logging
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from ._split import FinerTimeSplit
from hcrystalball.compose import TSColumnTransformer
from hcrystalball.metrics import get_scorer

logger = logging.getLogger(__name__)


[docs]def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequncies

    horizon : int
        How many units of frequency (e.g. 4 quarters), should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How big lag of observations should cv_splits have
        If kept as None, horizon is used resulting in non-overlaping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str
        Column in data, that contains country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str
        Country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon: bool
        Whether to add to default sklearn behavior also models, that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space

    prophet_models : bool
        Whether to consider FB prophet models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols if exog_cols is not None else []
    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = exog_cols + [country_code_column] if country_code_column else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols", "passthrough", cols)])
    else:
        exog_passthrough = "passthrough"
    # ensures holiday transformer is added to the pipeline if requested
    if country_code or country_code_column:
        from hcrystalball.feature_extraction import HolidayTransformer

        holiday = HolidayTransformer(country_code=country_code, country_code_column=country_code_column)
    else:
        holiday = "passthrough"

    estimator = Pipeline(
        [("exog_passthrough", exog_passthrough), ("holiday", holiday), ("model", "passthrough")]
    )

    scoring = get_scorer(scoring)
    cv = FinerTimeSplit(n_splits=n_splits, horizon=horizon, between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator, param_grid=[], scoring=scoring, cv=cv, refit=False, error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_seleciton.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        if autoarima_dict is None:
            autoarima_dict = {}
        if "error_action" not in autoarima_dict:
            autoarima_dict.update({"error_action": "raise"})

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append(
            (
                "model",
                SarimaxWrapper(
                    init_with_autoarima=True,
                    autoarima_dict=autoarima_dict,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
            )
        )

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            clip_predictions_lower=clip_predictions_lower,
            clip_predictions_upper=clip_predictions_upper,
        )

        sklearn_model_pipeline = Pipeline(
            [("seasonality", SeasonalityTransformer(auto=True, freq=frequency)), ("model", sklearn_model)]
        )
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(
                model_class,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            )
            for model_class in classes
        }

        optimize_for_horizon = [False, True] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append(
            {
                "model": [sklearn_model_pipeline],
                "model__seasonality__weekly": [True, False],
                "model__model": list(models.values()),
                # TODO change add once HistGradientBoostingRegressor is back
                # "model__model": list(models.values()) + [sklearn_model]
                "model__model__optimize_for_horizon": optimize_for_horizon,
                "model__model__lags": [3, 7, 10, 14],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [sklearn_model_pipeline],
                "model__seasonality__weekly": [True, False],
                "model__model__optimize_for_horizon": optimize_for_horizon,
                "model__model": [sklearn_model],
                "model__model__max_depth": [6],
            }
        )

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        extra_regressors = [None] if exog_cols is None else [None, exog_cols]

        grid_search.param_grid.append(
            {
                "model": [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__seasonality_mode": ["multiplicative", "additive"],
                "model__extra_regressors": extra_regressors,
            }
        )

        grid_search.param_grid.append(
            {
                "model": [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__extra_seasonalities": [
                    [
                        {
                            "name": "quarterly",
                            "period": 90.0625,
                            "fourier_order": 5,
                            "prior_scale": 15.0,
                            "mode": None,
                        }
                    ]
                ],
                "model__extra_regressors": extra_regressors,
            }
        )

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # commented options show non deterministic behavior
        grid_search.param_grid.append(
            {
                "model": [
                    ExponentialSmoothingWrapper(
                        freq=frequency,
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__trend": ["add"],
                "model__seasonal": [None, "add"],
                "model__damped": [True, False],
                "model__fit_params": [
                    {"use_boxcox": True, "use_basinhopping": False},
                    # {'use_boxcox':True, 'use_basinhopping':True},
                    {"use_boxcox": False, "use_basinhopping": False},
                    # {'use_boxcox':False, 'use_basinhopping':True}
                ],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [
                    ExponentialSmoothingWrapper(
                        freq=frequency,
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__trend": ["add"],
                "model__seasonal": ["mul"],
                "model__damped": [True, False],
                "model__fit_params": [
                    {"use_boxcox": False, "use_basinhopping": False},
                    # {'use_boxcox':False, 'use_basinhopping':True}
                ],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [
                    ExponentialSmoothingWrapper(
                        freq=frequency,
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__trend": [None],
                "model__seasonal": [None, "add", "mul"],
                "model__damped": [False],
                "model__fit_params": [
                    {"use_boxcox": False, "use_basinhopping": False},
                    # {'use_boxcox':False, 'use_basinhopping':True}
                ],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [
                    SimpleSmoothingWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    HoltSmoothingWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                ]
            }
        )

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append(
            {
                "model": [
                    TBATSWrapper(
                        use_arma_errors=False,
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ]
            }
        )

    if stacking_ensembles:
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from sklearn.ensemble import RandomForestRegressor

        grid_search.param_grid.append(
            {
                "model": [
                    StackingEnsemble(
                        train_n_splits=stacking_ensembles_train_n_splits,
                        train_horizon=stacking_ensembles_train_horizon,
                        meta_model=ElasticNet(),
                        horizons_as_features=True,
                        weekdays_as_features=True,
                        base_learners=[],
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__meta_model": [ElasticNet(), RandomForestRegressor()],
                "model__base_learners": [
                    [
                        ProphetWrapper(
                            clip_predictions_lower=clip_predictions_lower,
                            clip_predictions_upper=clip_predictions_upper,
                        ),
                        sklearn_model_pipeline,
                    ],
                ],
            }
        )
    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper

        grid_search.param_grid.append(
            {
                "model": [
                    SimpleEnsemble(
                        base_learners=[],
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    )
                ],
                "model__base_learners": [
                    [
                        ProphetWrapper(
                            clip_predictions_lower=clip_predictions_lower,
                            clip_predictions_upper=clip_predictions_upper,
                        ),
                        sklearn_model_pipeline,
                    ],
                ],
            }
        )

    return grid_search


[docs]def add_model_to_gridsearch(model, grid_search):
    """Extends gridsearch with provided model.

    Adds given model or list of models to the gridsearch under 'model' step

    Parameters
    ----------
    model : sklearn compatible model or list of sklearn compatible models
        model(s) to be added to provided grid search

    grid_search : sklearn.model_selection.GridSearchCV
        grid search, that has 'model' step as the last step

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Grid search enriched with given models
    """
    if isinstance(model, list):
        for mod in model:
            grid_search.param_grid.append({"model": [mod]})
    else:
        grid_search.param_grid.append({"model": [model]})

    return grid_search