Source code for hcrystalball.model_selection._configuration

import logging
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from ._split import FinerTimeSplit
from hcrystalball.compose import TSColumnTransformer
from hcrystalball.metrics import get_scorer

logger = logging.getLogger(__name__)


[docs]def get_gridsearch( frequency, horizon=10, n_splits=5, between_split_lag=None, scoring="neg_mean_absolute_error", country_code_column=None, country_code=None, sklearn_models=True, sklearn_models_optimize_for_horizon=False, autosarimax_models=False, autoarima_dict=None, prophet_models=False, tbats_models=False, exp_smooth_models=False, average_ensembles=False, stacking_ensembles=False, stacking_ensembles_train_horizon=10, stacking_ensembles_train_n_splits=20, clip_predictions_lower=None, clip_predictions_upper=None, exog_cols=None, ): """Get grid search object based on selection criteria. Parameters ---------- frequency : str Frequency of timeseries. Pandas compatible frequncies horizon : int How many units of frequency (e.g. 4 quarters), should be used to find the best models n_splits : int How many cross-validation folds should be used in model selection between_split_lag : int How big lag of observations should cv_splits have If kept as None, horizon is used resulting in non-overlaping cv_splits scoring : str, callable String of sklearn regression metric name, or hcrystalball compatible scorer. For creation of hcrystalball compatible scorer use `make_ts_scorer` function. country_code_column : str Column in data, that contains country code in str (e.g. 'DE'). Used in holiday transformer. Only one of `country_code_column` or `country_code` can be set. country_code : str Country code in str (e.g. 'DE'). Used in holiday transformer. Only one of `country_code_column` or `country_code` can be set. sklearn_models : bool Whether to consider sklearn models sklearn_models_optimize_for_horizon: bool Whether to add to default sklearn behavior also models, that optimize predictions for each horizon autosarimax_models : bool Whether to consider auto sarimax models autoarima_dict : dict Specification of pmdautoarima search space prophet_models : bool Whether to consider FB prophet models exp_smooth_models : bool Whether to consider exponential smoothing models average_ensembles : bool Whether to consider average ensemble models stacking_ensembles : bool Whether to consider stacking ensemble models stacking_ensembles_train_horizon : int Which horizon should be used in meta model in stacking ensembles stacking_ensembles_train_n_splits : int Number of splits used in meta model in stacking ensembles clip_predictions_lower : float, int Minimal number allowed in the predictions clip_predictions_upper : float, int Maximal number allowed in the predictions exog_cols : list List of columns to be used as exogenous variables Returns ------- sklearn.model_selection.GridSearchCV CV / Model selection configuration """ exog_cols = exog_cols if exog_cols is not None else [] # ensures only exogenous columns and country code column will be passed to model if provided # and columns names will be stored in TSColumnTransformer if exog_cols: cols = exog_cols + [country_code_column] if country_code_column else exog_cols exog_passthrough = TSColumnTransformer(transformers=[("raw_cols", "passthrough", cols)]) else: exog_passthrough = "passthrough" # ensures holiday transformer is added to the pipeline if requested if country_code or country_code_column: from hcrystalball.feature_extraction import HolidayTransformer holiday = HolidayTransformer(country_code=country_code, country_code_column=country_code_column) else: holiday = "passthrough" estimator = Pipeline( [("exog_passthrough", exog_passthrough), ("holiday", holiday), ("model", "passthrough")] ) scoring = get_scorer(scoring) cv = FinerTimeSplit(n_splits=n_splits, horizon=horizon, between_split_lag=between_split_lag) grid_search = GridSearchCV( estimator=estimator, param_grid=[], scoring=scoring, cv=cv, refit=False, error_score=np.nan, ) if autosarimax_models: # adding autosarimax to param_grid might cause differently found models # for different splits and raise inconsistency based errors. # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`) # and handled in `hcrystalball.model_seleciton.select_model` function in following way # 1. get best model for the data part on last split # 2. append this best model to original `param_grid` # 3. run full grid search with `param_grid` containing # sarimax model selected from autosarimax in point 1 from hcrystalball.wrappers import SarimaxWrapper if autoarima_dict is None: autoarima_dict = {} if "error_action" not in autoarima_dict: autoarima_dict.update({"error_action": "raise"}) grid_search.autosarimax = Pipeline(estimator.steps[:-1]) grid_search.autosarimax.steps.append( ( "model", SarimaxWrapper( init_with_autoarima=True, autoarima_dict=autoarima_dict, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ), ) ) if stacking_ensembles or average_ensembles or sklearn_models: from sklearn.linear_model import ElasticNet from sklearn.ensemble import RandomForestRegressor # TODO when scoring time is fixed, add HistGradientBoostingRegressor # from sklearn.experimental import enable_hist_gradient_boosting # from sklearn.ensemble import HistGradientBoostingRegressor from hcrystalball.wrappers import get_sklearn_wrapper from hcrystalball.feature_extraction import SeasonalityTransformer sklearn_model = get_sklearn_wrapper( RandomForestRegressor, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) sklearn_model_pipeline = Pipeline( [("seasonality", SeasonalityTransformer(auto=True, freq=frequency)), ("model", sklearn_model)] ) # TODO make sure naming here works as expected sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}" if sklearn_models: classes = [ElasticNet, RandomForestRegressor] models = { model_class.__name__: get_sklearn_wrapper( model_class, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) for model_class in classes } optimize_for_horizon = [False, True] if sklearn_models_optimize_for_horizon else [False] grid_search.param_grid.append( { "model": [sklearn_model_pipeline], "model__seasonality__weekly": [True, False], "model__model": list(models.values()), # TODO change add once HistGradientBoostingRegressor is back # "model__model": list(models.values()) + [sklearn_model] "model__model__optimize_for_horizon": optimize_for_horizon, "model__model__lags": [3, 7, 10, 14], } ) grid_search.param_grid.append( { "model": [sklearn_model_pipeline], "model__seasonality__weekly": [True, False], "model__model__optimize_for_horizon": optimize_for_horizon, "model__model": [sklearn_model], "model__model__max_depth": [6], } ) if prophet_models: from hcrystalball.wrappers import ProphetWrapper extra_regressors = [None] if exog_cols is None else [None, exog_cols] grid_search.param_grid.append( { "model": [ ProphetWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__seasonality_mode": ["multiplicative", "additive"], "model__extra_regressors": extra_regressors, } ) grid_search.param_grid.append( { "model": [ ProphetWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__extra_seasonalities": [ [ { "name": "quarterly", "period": 90.0625, "fourier_order": 5, "prior_scale": 15.0, "mode": None, } ] ], "model__extra_regressors": extra_regressors, } ) if exp_smooth_models: from hcrystalball.wrappers import ExponentialSmoothingWrapper from hcrystalball.wrappers import HoltSmoothingWrapper from hcrystalball.wrappers import SimpleSmoothingWrapper # commented options show non deterministic behavior grid_search.param_grid.append( { "model": [ ExponentialSmoothingWrapper( freq=frequency, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__trend": ["add"], "model__seasonal": [None, "add"], "model__damped": [True, False], "model__fit_params": [ {"use_boxcox": True, "use_basinhopping": False}, # {'use_boxcox':True, 'use_basinhopping':True}, {"use_boxcox": False, "use_basinhopping": False}, # {'use_boxcox':False, 'use_basinhopping':True} ], } ) grid_search.param_grid.append( { "model": [ ExponentialSmoothingWrapper( freq=frequency, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__trend": ["add"], "model__seasonal": ["mul"], "model__damped": [True, False], "model__fit_params": [ {"use_boxcox": False, "use_basinhopping": False}, # {'use_boxcox':False, 'use_basinhopping':True} ], } ) grid_search.param_grid.append( { "model": [ ExponentialSmoothingWrapper( freq=frequency, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__trend": [None], "model__seasonal": [None, "add", "mul"], "model__damped": [False], "model__fit_params": [ {"use_boxcox": False, "use_basinhopping": False}, # {'use_boxcox':False, 'use_basinhopping':True} ], } ) grid_search.param_grid.append( { "model": [ SimpleSmoothingWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ), HoltSmoothingWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ), ] } ) if tbats_models: from hcrystalball.wrappers import TBATSWrapper grid_search.param_grid.append( { "model": [ TBATSWrapper( use_arma_errors=False, clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ] } ) if stacking_ensembles: from hcrystalball.ensemble import StackingEnsemble from hcrystalball.wrappers import ProphetWrapper from sklearn.ensemble import RandomForestRegressor grid_search.param_grid.append( { "model": [ StackingEnsemble( train_n_splits=stacking_ensembles_train_n_splits, train_horizon=stacking_ensembles_train_horizon, meta_model=ElasticNet(), horizons_as_features=True, weekdays_as_features=True, base_learners=[], clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__meta_model": [ElasticNet(), RandomForestRegressor()], "model__base_learners": [ [ ProphetWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ), sklearn_model_pipeline, ], ], } ) if average_ensembles: from hcrystalball.ensemble import SimpleEnsemble from hcrystalball.wrappers import ProphetWrapper grid_search.param_grid.append( { "model": [ SimpleEnsemble( base_learners=[], clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ) ], "model__base_learners": [ [ ProphetWrapper( clip_predictions_lower=clip_predictions_lower, clip_predictions_upper=clip_predictions_upper, ), sklearn_model_pipeline, ], ], } ) return grid_search
[docs]def add_model_to_gridsearch(model, grid_search): """Extends gridsearch with provided model. Adds given model or list of models to the gridsearch under 'model' step Parameters ---------- model : sklearn compatible model or list of sklearn compatible models model(s) to be added to provided grid search grid_search : sklearn.model_selection.GridSearchCV grid search, that has 'model' step as the last step Returns ------- sklearn.model_selection.GridSearchCV Grid search enriched with given models """ if isinstance(model, list): for mod in model: grid_search.param_grid.append({"model": [mod]}) else: grid_search.param_grid.append({"model": [model]}) return grid_search