# Source code for hcrystalball.ensemble._stacking_ensemble

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator

from hcrystalball.exceptions import DuplicatedModelNameError
from hcrystalball.model_selection import FinerTimeSplit
from hcrystalball.utils import check_fit_before_predict
from hcrystalball.utils import check_X_y
from hcrystalball.utils import enforce_y_type
from hcrystalball.utils import get_estimator_name


class StackingEnsemble(BaseEstimator):
    """StackingEnsemble model, which takes a list of any hcrystalball model wrapper
    instance(s) as base learners.

    During fitting the base learners are fitted and prediction(s) will be made for the
    requested horizon, using possibly more than one split. The predictions for each model
    in all splits are concatenated and serve as the feature matrix for the meta model,
    with the prediction of each model over all splits being a distinct feature.
    Finally the meta model, which is just a regular regressor, is fitted to that data
    to determine the relative weights of each base learner in the prediction of the ensemble.

    As a default behaviour the meta model is fitted only the first time the fit() method
    is called; each subsequent call of fit() (on a given StackingEnsemble instance) omits
    the fitting of the meta model and fits only the base learners. This behaviour can be
    changed using the `fit_meta_model_always` parameter to force the meta model to be
    refitted every time fit() is called. Note, however, that this latter behaviour can be
    computationally expensive, as fitting the meta model requires fitting the base learners
    `train_n_splits` times.

    Parameters
    ----------
    base_learners: list
        List of fully instantiated hcrystalball model wrappers

    meta_model: sklearn.base.BaseEstimator
        Scikit-learn compatible regressor

    name: str
        Unique name / identifier of the model instance

    train_n_splits: int
        Number of splits used for fitting the meta model

    train_horizon: int
        Max. number of steps ahead to be predicted. Ideally this value should not be
        identical to the forecasting horizon in prediction.

    horizons_as_features: bool
        Adds horizon feature for meta model

    weekdays_as_features: bool
        Adds weekdays feature for meta model

    fit_meta_model_always: bool
        If True the meta model will always be re-fitted, each time the fit() method is
        called, if False the meta model will only be fitted the first time the fit() method
        is called and in subsequent calls of the fit() method only the base learners
        will be re-fitted.

    clip_predictions_lower: float
        Lower bound passed to the clipping of the ensemble prediction in predict().
        None (default) leaves the predictions unbounded from below.

    clip_predictions_upper: float
        Upper bound passed to the clipping of the ensemble prediction in predict().
        None (default) leaves the predictions unbounded from above.
    """

    def __init__(
        self,
        base_learners,
        meta_model,
        name="stacking_ensemble",
        train_n_splits=1,
        train_horizon=10,
        horizons_as_features=True,
        weekdays_as_features=True,
        fit_meta_model_always=False,
        clip_predictions_lower=None,
        clip_predictions_upper=None,
    ):
        # Fail early on ambiguous base learner names: they are used as column labels
        # of the meta model's feature matrix.
        self._check_base_learners_names(base_learners)

        self.name = name
        self.base_learners = base_learners
        self.meta_model = meta_model
        self.train_n_splits = train_n_splits
        self.train_horizon = train_horizon
        self.fit_meta_model_always = fit_meta_model_always
        self.horizons_as_features = horizons_as_features
        self.weekdays_as_features = weekdays_as_features
        # Flipped to True at the end of fit(); decides whether the meta model is (re)fitted.
        self.fitted = False
        self.clip_predictions_lower = clip_predictions_lower
        self.clip_predictions_upper = clip_predictions_upper

    @staticmethod
    def _check_base_learners_names(models):
        """Check if the base learner models have all unique names

        Parameters
        ----------
        models: list
            List of instantiated hcrystalball model wrapper instances

        Returns
        -------
        None

        Raises
        ------
        DuplicatedModelNameError
            If multiple models have the same `name` attribute.
        """
        names = [get_estimator_name(model) for model in models]
        if len(names) != len(set(names)):
            raise DuplicatedModelNameError(
                "There seems to be duplicates in model names among StackingEnsemble base learners. "
                "Model names should be unique."
            )

    def _fit_base_learners(self, X, y=None):
        """Fit the base learners

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        y: numpy.ndarray
            Target vector.

        Returns
        -------
        None
        """
        for model in self.base_learners:
            model.fit(X, y)

    def _predict_features_for_meta_models(self, X):
        """Provide predictions from all base learners

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Container sharing the index of `X`, with the names of the base learners as
            column names. Each column contains the prediction of the base learner
            whose name is the column name.
        """
        prediction = pd.DataFrame(
            index=X.index,
            columns=[get_estimator_name(model) for model in self.base_learners],
        )

        for model in self.base_learners:
            model_name = get_estimator_name(model)
            # Each base learner's predict() output is flattened to fill a single column.
            prediction.loc[:, model_name] = model.predict(X).values.squeeze()

        return prediction

    @staticmethod
    def _create_horizons_as_features(cross_results_index, horizon, n_splits):
        """DataFrame with dummy columns describing the horizon variable.

        A dummy column is created for each horizon step (i.e. horizon 5 == 5 new columns,
        labelled 0..4). Each column flags only the rows belonging to its particular
        horizon step. The 0..horizon-1 pattern is repeated `n_splits` times, once per split.
        Used by fit() and predict() when `horizons_as_features` is True.

        Parameters
        ----------
        cross_results_index: pandas.Index
            Index assigned to the resulting DataFrame; it has to contain
            `horizon * n_splits` entries to match the generated rows.

        horizon: int
            Number of horizon steps (and therefore dummy columns) per split.

        n_splits: int
            Number of times the horizon pattern is repeated.

        Returns
        -------
        pandas.DataFrame
            A DataFrame container with dummy column for each value in horizon
            (i.e. horizon 5 == 5 new columns). These features should help the meta model
            to properly weight the base learners' predictions based on horizon
        """
        return pd.get_dummies(pd.Series(list(np.arange(horizon)) * n_splits)).set_index(
            cross_results_index
        )

    @staticmethod
    def _create_weekdays_as_features(cross_results_index):
        """DataFrame with dummy columns for each week day based on provided `cross_results_index`

        Parameters
        ----------
        cross_results_index: pandas.Index
            Index convertible by `pandas.to_datetime`; it is also used as the index
            of the resulting DataFrame.

        Returns
        -------
        pandas.DataFrame
            One dummy column per day name occurring in `cross_results_index`.
        """
        return pd.get_dummies(pd.to_datetime(cross_results_index).day_name()).set_index(
            cross_results_index
        )

    @enforce_y_type
    @check_X_y
    def fit(self, X, y=None):
        """Fit the stacking ensemble model

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        y: numpy.ndarray
            Target vector.

        Returns
        -------
        StackingEnsemble
            A fitted StackingEnsemble instance
        """
        # `base_learners` is a public attribute and may have changed since __init__.
        self._check_base_learners_names(self.base_learners)

        # Fit the base learners and the meta_model
        if (not self.fitted) or self.fit_meta_model_always:
            splitter = FinerTimeSplit(horizon=self.train_horizon, n_splits=self.train_n_splits)

            # The meta model is trained on the last `train_n_splits * train_horizon` rows.
            n_train_meta = self.train_n_splits * self.train_horizon
            X_meta = pd.DataFrame(
                index=X.index[-n_train_meta:],
                columns=[get_estimator_name(bl) for bl in self.base_learners],
            )
            y_meta = y[-n_train_meta:]

            # Get base learners predictions
            for ind_train, ind_pred in splitter.split(X):
                X_train = X.iloc[ind_train, :]
                X_pred = X.iloc[ind_pred, :]
                y_train = y[ind_train]

                self._fit_base_learners(X_train, y_train)
                X_meta.loc[X_pred.index, :] = self._predict_features_for_meta_models(X_pred)

            # Add dummy horizon variable for meta model
            if self.horizons_as_features:
                X_meta = pd.concat(
                    [
                        X_meta,
                        self._create_horizons_as_features(
                            cross_results_index=X_meta.index,
                            horizon=self.train_horizon,
                            n_splits=self.train_n_splits,
                        ),
                    ],
                    axis=1,
                )

            if self.weekdays_as_features:
                X_meta = pd.concat(
                    [X_meta, self._create_weekdays_as_features(cross_results_index=X_meta.index)],
                    axis=1,
                )

            # Remember the training feature layout so predict() can reproduce it.
            self._fit_columns = X_meta.columns
            self.meta_model.fit(X_meta.values, y_meta)

        # Fit the base learners on the whole training set
        self._fit_base_learners(X, y)
        self.fitted = True

        return self

    def _ensure_pred_and_train_cols_equals(self, X):
        """Return `X` restricted to, and ordered by, the columns seen during training.

        Columns that were present when the meta model was fitted but are missing in `X`
        (i.e. test data could miss some weekdays or horizons) are added and filled
        with zeros. Columns unknown from training are dropped. This is important
        as most regressors expect the same structure of data for training as for inference.

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Data with the same features, in the same order, as the train set had
        """
        # Column labels mix base learner names (str) with horizon labels (int), so sorting
        # the difference is pointless; the final selection restores the training order.
        miss_cols = list(self._fit_columns.difference(X.columns, sort=False))

        if miss_cols:
            miss_data = pd.DataFrame(
                data=np.zeros((len(X.index), len(miss_cols))),
                columns=miss_cols,
                index=X.index,
            )
            X = X.join(miss_data)

        return X[self._fit_columns]

    @check_fit_before_predict
    def predict(self, X):
        """Calculate the prediction of the ensemble for a given set of date / time

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            A DataFrame container with the index being the input (date)time vector.
            The single column in the DataFrame contains the prediction and the column
            name is the name of the model (i.e. the `name` parameter passed to the constructor)
        """
        X_meta = self._predict_features_for_meta_models(X)
        y_pred = pd.DataFrame(index=X.index, columns=[self.name])

        if self.horizons_as_features:
            # At prediction time the whole of X is treated as one split of len(X) steps.
            X_meta = pd.concat(
                [
                    X_meta,
                    self._create_horizons_as_features(
                        cross_results_index=X_meta.index,
                        horizon=len(X_meta),
                        n_splits=1,
                    ),
                ],
                axis=1,
            )

        if self.weekdays_as_features:
            X_meta = pd.concat(
                [X_meta, self._create_weekdays_as_features(cross_results_index=X_meta.index)],
                axis=1,
            )

        # Align features with the layout the meta model was fitted on.
        X_meta = self._ensure_pred_and_train_cols_equals(X_meta)

        y_pred[self.name] = self.meta_model.predict(X_meta.values)
        y_pred[self.name] = y_pred[self.name].clip(
            lower=self.clip_predictions_lower, upper=self.clip_predictions_upper
        )

        return y_pred