# Source code for hcrystalball.model_selection._configuration

import logging

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from hcrystalball.compose import TSColumnTransformer
from hcrystalball.feature_extraction import HolidayTransformer
from hcrystalball.metrics import get_scorer

from ._split import FinerTimeSplit

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Fixed seed passed as `random_state` to the RandomForestRegressor instances
# created in `get_gridsearch`.
RANDOM_STATE = 42


def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    holidays_days_before=0,
    holidays_days_after=0,
    holidays_bridge_days=False,
    sklearn_models=True,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=False,
    tbats_models=False,
    exp_smooth_models=False,
    theta_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
    hcb_verbose=False,
):
    """Get grid search object based on selection criteria.

    Builds a three step pipeline (``exog_passthrough`` -> ``holiday`` -> ``model``)
    and a ``GridSearchCV`` around it whose ``param_grid`` is filled with one or more
    parameter dictionaries for every requested model family.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequencies

    horizon : int
        How many units of frequency (e.g. 4 quarters), should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How big lag of observations should cv_splits have
        If kept as None, horizon is used resulting in non-overlapping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str, list
        Column(s) in data, that contain country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str, list
        Country code(s) in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    holidays_days_before : int
        Number of days before the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days before holidays,
        otherwise False)

    holidays_days_after : int
        Number of days after the holiday which will be taken into account
        (i.e. 2 means that new bool column will be created and will be True for 2 days after holidays,
        otherwise False)

    holidays_bridge_days : bool
        Overlapping `holidays_days_before` and `holidays_days_after` feature which serves for modeling
        between holidays working days

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon: bool
        Whether to add to default sklearn behavior also models, that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space. The passed dictionary is not modified;
        ``error_action="raise"`` is added to an internal copy when the key is missing.

    prophet_models : bool
        Whether to consider FB prophet models

    tbats_models : bool
        Whether to consider TBATS models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    theta_models : bool
        Whether to consider theta models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    hcb_verbose : bool
        Whether to keep (True) or suppress (False) messages to stdout and stderr from the wrapper
        and 3rd party libraries during fit and predict

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols or []
    country_code_columns = (
        [country_code_column] if isinstance(country_code_column, str) else country_code_column
    )
    country_codes = [country_code] if isinstance(country_code, str) else country_code

    # keyword arguments repeated for (almost) every model created below;
    # ensembles take only the clipping bounds, wrappers additionally take `hcb_verbose`
    clip_kwargs = {
        "clip_predictions_lower": clip_predictions_lower,
        "clip_predictions_upper": clip_predictions_upper,
    }
    wrapper_kwargs = {**clip_kwargs, "hcb_verbose": hcb_verbose}
    holiday_kwargs = {
        "days_before": holidays_days_before,
        "days_after": holidays_days_after,
        "bridge_days": holidays_bridge_days,
    }

    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = (exog_cols + country_code_columns) if country_code_columns else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols", "passthrough", cols)])
    else:
        exog_passthrough = "passthrough"

    # ensures holiday transformer is added to the pipeline if requested;
    # explicit country codes take precedence over country code columns
    if country_codes:
        holiday = Pipeline(
            [
                (f"holiday_{code}", HolidayTransformer(country_code=code, **holiday_kwargs))
                for code in country_codes
            ]
        )
    elif country_code_columns:
        holiday = Pipeline(
            [
                (f"holiday_{col}", HolidayTransformer(country_code_column=col, **holiday_kwargs))
                for col in country_code_columns
            ]
        )
    else:
        holiday = "passthrough"

    estimator = Pipeline(
        [("exog_passthrough", exog_passthrough), ("holiday", holiday), ("model", "passthrough")]
    )

    cv = FinerTimeSplit(n_splits=n_splits, horizon=horizon, between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=get_scorer(scoring),
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_selection.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        # work on a copy so that the dictionary owned by the caller is never mutated
        autoarima_dict = dict(autoarima_dict or {})
        autoarima_dict.setdefault("error_action", "raise")

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append(
            (
                "model",
                SarimaxWrapper(
                    init_with_autoarima=True,
                    autoarima_dict=autoarima_dict,
                    **wrapper_kwargs,
                ),
            )
        )

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import ElasticNet

        from hcrystalball.feature_extraction import SeasonalityTransformer

        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            random_state=RANDOM_STATE,
            **wrapper_kwargs,
        )

        # this pipeline is reused by the sklearn grids and as a base learner of both ensembles
        sklearn_model_pipeline = Pipeline(
            [("seasonality", SeasonalityTransformer(auto=True, freq=frequency)), ("model", sklearn_model)]
        )
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        models = {
            "ElasticNet": get_sklearn_wrapper(ElasticNet, **wrapper_kwargs),
            "RandomForestRegressor": sklearn_model,
        }

        optimize_for_horizon = [False, True] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append(
            {
                "model": [sklearn_model_pipeline],
                "model__seasonality__weekly": [True, False],
                "model__model": list(models.values()),
                # TODO change add once HistGradientBoostingRegressor is back
                # "model__model": list(models.values()) + [sklearn_model]
                "model__model__optimize_for_horizon": optimize_for_horizon,
                "model__model__lags": [3, 7, 10, 14],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [sklearn_model_pipeline],
                "model__seasonality__weekly": [True, False],
                "model__model__optimize_for_horizon": optimize_for_horizon,
                "model__model": [sklearn_model],
                "model__model__max_depth": [6],
            }
        )

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        # `exog_cols` was normalised to a list above, so emptiness (not `None`) is what
        # tells whether there are regressors worth adding as a second candidate
        extra_regressors = [None, exog_cols] if exog_cols else [None]

        grid_search.param_grid.append(
            {
                "model": [ProphetWrapper(**wrapper_kwargs)],
                "model__seasonality_mode": ["multiplicative", "additive"],
                "model__extra_regressors": extra_regressors,
            }
        )

        grid_search.param_grid.append(
            {
                "model": [ProphetWrapper(**wrapper_kwargs)],
                "model__extra_seasonalities": [
                    [
                        {
                            "name": "quarterly",
                            "period": 90.0625,
                            "fourier_order": 5,
                            "prior_scale": 15.0,
                            "mode": None,
                        }
                    ]
                ],
                "model__extra_regressors": extra_regressors,
            }
        )

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # only combinations listed here are searched; the original authors left out
        # options that showed non deterministic behavior
        # additive trend, no / additive seasonality
        grid_search.param_grid.append(
            {
                "model": [ExponentialSmoothingWrapper(freq=frequency, **wrapper_kwargs)],
                "model__trend": ["add"],
                "model__seasonal": [None, "add"],
                "model__damped_trend": [True, False],
                "model__use_boxcox": [True, False],
                "model__use_basinhopping": [False],
            }
        )

        # additive trend, multiplicative seasonality (without boxcox)
        grid_search.param_grid.append(
            {
                "model": [ExponentialSmoothingWrapper(freq=frequency, **wrapper_kwargs)],
                "model__trend": ["add"],
                "model__seasonal": ["mul"],
                "model__damped_trend": [True, False],
                "model__use_boxcox": [False],
                "model__use_basinhopping": [False],
            }
        )

        # no trend, any seasonality
        grid_search.param_grid.append(
            {
                "model": [ExponentialSmoothingWrapper(freq=frequency, **wrapper_kwargs)],
                "model__trend": [None],
                "model__seasonal": [None, "add", "mul"],
                "model__damped_trend": [False],
                "model__use_boxcox": [False],
                "model__use_basinhopping": [False],
            }
        )

        grid_search.param_grid.append(
            {
                "model": [
                    SimpleSmoothingWrapper(**wrapper_kwargs),
                    HoltSmoothingWrapper(**wrapper_kwargs),
                ]
            }
        )

    if theta_models:
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append({"model": [ThetaWrapper(**wrapper_kwargs)]})

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append(
            {"model": [TBATSWrapper(use_arma_errors=False, **wrapper_kwargs)]}
        )

    if stacking_ensembles:
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.linear_model import ElasticNet

        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append(
            {
                "model": [
                    StackingEnsemble(
                        train_n_splits=stacking_ensembles_train_n_splits,
                        train_horizon=stacking_ensembles_train_horizon,
                        meta_model=ElasticNet(),
                        horizons_as_features=True,
                        weekdays_as_features=True,
                        base_learners=[],
                        **clip_kwargs,
                    )
                ],
                "model__meta_model": [ElasticNet(), RandomForestRegressor(random_state=RANDOM_STATE)],
                "model__base_learners": [
                    [
                        ProphetWrapper(**wrapper_kwargs),
                        sklearn_model_pipeline,
                        ThetaWrapper(**wrapper_kwargs),
                    ],
                ],
            }
        )

    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from hcrystalball.wrappers import ThetaWrapper

        grid_search.param_grid.append(
            {
                "model": [SimpleEnsemble(base_learners=[], **clip_kwargs)],
                "model__base_learners": [
                    [
                        ProphetWrapper(**wrapper_kwargs),
                        sklearn_model_pipeline,
                        ThetaWrapper(**wrapper_kwargs),
                    ],
                ],
            }
        )

    return grid_search


def add_model_to_gridsearch(model, grid_search):
    """Extends gridsearch with provided model.

    Adds given model or list of models to the gridsearch under 'model' step.
    Every model becomes its own ``{"model": [model]}`` entry appended to
    ``grid_search.param_grid``; the grid search is modified in place.

    Parameters
    ----------
    model : sklearn compatible model or list of sklearn compatible models
        model(s) to be added to provided grid search

    grid_search : sklearn.model_selection.GridSearchCV
        grid search, that has 'model' step as the last step

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Grid search enriched with given models (the same object that was passed in)
    """
    # a single model is treated as a one-element list so both cases share one code path
    candidates = model if isinstance(model, list) else [model]
    for candidate in candidates:
        grid_search.param_grid.append({"model": [candidate]})

    return grid_search