Source code for hcrystalball.model_selection._model_selector_result

import functools

import pandas as pd

from hcrystalball.utils import generate_estimator_hash
from hcrystalball.utils import generate_partition_hash
from hcrystalball.utils import get_estimator_name

from .utils import _load_file
from .utils import _persist_to_file

load_model_selector_result = functools.partial(_load_file, expert_type="model_selector_result")


class ModelSelectorResult:
    """Consolidate information/methods from cross validation for one time series.

    Store all relevant information about model selection and provide utility
    methods (e.g. ``plot_result``) and data (e.g. ``df_plot``) for easier access
    to further insights.

    Parameters
    ----------
    best_model : sklearn compatible estimator
        best model found during model selection
    cv_results : pandas.DataFrame
        cv_results of sklearn.model_selection.GridSearchCV in form of DataFrame
    cv_data : pandas.DataFrame
        data with models predictions, cv split indication and true target values
    model_reprs : dict
        dictionary of model representations used in model selection
        in form of {model_hash : model_repr}
    partition : dict
        dictionary indicating which part of the data the model selection results
        belong to, e.g. {"Region": "Canada", "Product": "Chips"}
    X_train : pandas.DataFrame
        training data features
    y_train : pandas.Series
        training data target
    frequency : str
        temporal frequency of data on which the model was trained / selected
    horizon : int
        how many steps ahead predictions were made
    country_code_column : str
        name of the column with ISO code of country/region, which can be used
        for supplying holidays,
        e.g. 'State' with values like 'DE', 'CZ' or 'Region' with values like
        'DE-NW', 'DE-HE', etc.
    best_model_rank : int
        rank of the best model within cv_results ('rank_test_score')
    """

    def __init__(
        self,
        best_model,
        cv_results,
        cv_data,
        model_reprs,
        partition,
        X_train,
        y_train,
        frequency,
        horizon,
        country_code_column,
        best_model_rank,
    ):
        self.best_model = best_model
        self.cv_results = cv_results
        self.cv_data = cv_data
        self.model_reprs = model_reprs
        self.partition = partition
        self.X_train = X_train
        self.y_train = y_train
        self.frequency = frequency
        self.horizon = horizon
        self.country_code_column = country_code_column
        self.best_model_rank = best_model_rank

        self.best_model_hash = generate_estimator_hash(best_model)
        self.best_model_cv_data = self.cv_data.rename({self.best_model_hash: "best_model"}, axis=1)[
            ["split", "y_true", "best_model"]
        ]
        self.best_model_name = get_estimator_name(best_model).replace("model__", "")
        self.best_model_cv_results = self.cv_results[
            self.cv_results["rank_test_score"] == self.best_model_rank
        ].iloc[0]
        self.best_model_repr = self.model_reprs[self.best_model_hash]
        self.partition_hash = generate_partition_hash(self.partition)

        self._persist_attrs = sorted(set(self.__dict__.keys()).difference(["self"]))
        self._df_plot = None
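
    # Illustrative sketch (an assumption, not part of this module): a ModelSelectorResult
    # is normally produced by hcrystalball's model selection rather than constructed by
    # hand. Once an instance `result` exists, the derived attributes set in __init__ above
    # give quick access to the winning model:
    #
    #     result.best_model_name        # estimator name without the "model__" prefix
    #     result.best_model_repr        # string representation stored in model_reprs
    #     result.best_model_cv_results  # cv_results row where rank_test_score == best_model_rank
    #     result.partition_hash         # hash used when persisting files for this partition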

    def persist(self, attribute_name=None, path=""):
        """Persist the whole object or a particular object attribute.

        Parameters
        ----------
        attribute_name : str
            Name of the attribute to be stored; if None, the whole object is stored
        path : str
            Where to store the object or object attribute.
            Creates a file named {partition_hash}.{attribute_name} by default
            in the current working directory

        Raises
        ------
        ValueError
            If `attribute_name` is not a valid option; lists the available ones
        """
        if attribute_name is None:
            _persist_to_file(
                data=self,
                expert_type="model_selector_result",
                partition_hash=self.partition_hash,
                path=path,
            )
        else:
            if attribute_name not in self._persist_attrs:
                raise ValueError(
                    f"Parameter attribute must be one of {self._persist_attrs}, "
                    f"but you provided {attribute_name}"
                )
            _persist_to_file(
                data=getattr(self, attribute_name),
                expert_type=attribute_name,
                partition_hash=self.partition_hash,
                path=path,
            )
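
    # Usage sketch for persist() (the "results" directory is a hypothetical path):
    # persisting the whole object or a single attribute writes a file named
    # {partition_hash}.{expert_type} into the given path.
    #
    #     result.persist(path="results")                            # -> <partition_hash>.model_selector_result
    #     result.persist(attribute_name="cv_data", path="results")  # -> <partition_hash>.cv_data
    #
    # Passing an attribute name outside self._persist_attrs raises a ValueError
    # listing the valid options.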

    @property
    def df_plot(self):
        """Training data suitable for plotting.

        Utility that prepares data from model selection to be used
        for further model performance analysis.

        Returns
        -------
        pandas.DataFrame
            Data suitable for plotting
        """
        # compute the merge only once and cache it in self._df_plot
        if self._df_plot is None:
            self._df_plot = (
                pd.merge(
                    self.y_train.rename("actuals"),
                    self.best_model_cv_data[["best_model", "split"]].rename(
                        {"best_model": f"cv_forecast({self.best_model_name})", "split": "cv_split"},
                        axis=1,
                    ),
                    left_index=True,
                    right_index=True,
                    how="outer",
                )
                # TODO add different error functions??
                .assign(error=lambda x: (x["actuals"] - x[f"cv_forecast({self.best_model_name})"]).abs())
                .assign(cv_split_str=lambda x: "cv_split=" + x["cv_split"].astype(str))
                .assign(
                    mae=lambda x: x["cv_split_str"].map(x.groupby(["cv_split_str"])["error"].mean().to_dict())
                )
                .assign(cv_split_str=lambda x: x["cv_split_str"] + ", mae=" + x["mae"].round(2).astype(str))
            )
        return self._df_plot

    @property
    def cv_splits_overlap(self):
        """Indicator of cv_splits overlap in training data.

        Returns
        -------
        bool
            Whether cv_splits in training data contain overlap
        """
        return sum(self.df_plot.reset_index().groupby("index")["cv_split"].count() > 1) > 0
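
    # Sketch of the df_plot layout (column names follow the .assign calls above;
    # concrete values are hypothetical): one row per training timestamp with columns
    #
    #     actuals, cv_forecast(<best_model_name>), cv_split, error, cv_split_str, mae
    #
    # so per-split diagnostics can be derived directly, e.g.
    #
    #     result.df_plot.groupby("cv_split")["error"].mean()  # MAE per cv split
    #
    # cv_splits_overlap is True when at least one timestamp appears in more than one cv split.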

    def plot_result(self, plot_from=None, **plot_params):
        """Plot model performance from the given `plot_from` timestamp onwards.

        Parameters
        ----------
        plot_from : str
            date from which to show actuals, cv_forecast and forecast;
            the default behavior does not filter dates
        plot_params : kwargs
            plotting parameters passed down to pandas.DataFrame.plot(),
            dependent on your plotting backend,
            e.g. `figsize=(16, 9)`, `title='Performance of Model'`

        Returns
        -------
        pandas.DataFrame.plot()
            plot depending on your plotting backend, by default from matplotlib;
            a list of plots (one per cv split) is returned when cv splits overlap
        """
        df = self.df_plot
        plot_from = plot_from or df.index.min()

        if self.cv_splits_overlap:
            # plot each split separately
            plts = []
            cv_fcst_col = f"cv_forecast({self.best_model_name})"
            for split in df["cv_split"].dropna().unique():
                plt = (
                    df[plot_from:]
                    .drop(
                        ["error", cv_fcst_col, "cv_split", "cv_split_str", "mae"],
                        axis=1,
                    )
                    .plot(**plot_params)
                )
                # get limits for shaded areas
                min_y, max_y = plt.get_ylim()
                plt_cv = df[plot_from:].loc[lambda x: x["cv_split"] == split, [cv_fcst_col, "cv_split_str"]]
                plt = plt_cv[[cv_fcst_col]].plot(
                    ax=plt,
                    title=f"{plt.get_title()} | ({plt_cv['cv_split_str'].unique()[0]})",
                )
                if not plt_cv.empty:
                    plt.fill_between(
                        x=plt_cv.index.values,
                        y1=max_y,
                        y2=min_y,
                        alpha=[0.2, 0.4][split % 2],
                        color=["gray"],
                        label=plt_cv["cv_split_str"].unique()[0],
                    )
                plt.legend(facecolor="white", framealpha=0.8, frameon=True)
                plts.append(plt)
            return plts
        else:
            plt = (
                df[plot_from:].drop(["error", "cv_split", "cv_split_str", "mae"], axis=1).plot(**plot_params)
            )
            # get limits for shaded areas
            min_y, max_y = plt.get_ylim()
            for split in df["cv_split"].dropna().unique():
                tmp_df = df[plot_from:].loc[lambda x: x["cv_split"] == split, ["cv_split_str"]]
                if not tmp_df.empty:
                    plt.fill_between(
                        x=tmp_df.index.values,
                        y1=max_y,
                        y2=min_y,
                        alpha=[0.2, 0.4][split % 2],
                        color=["gray"],
                        label=tmp_df["cv_split_str"].unique()[0],
                    )
            # include shaded area labels in the legend
            plt.legend(facecolor="white", framealpha=0.8, frameon=True)
            return plt
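
    # Usage sketch for plot_result() (the date, figsize and title below are arbitrary examples
    # passed through to pandas.DataFrame.plot):
    #
    #     ax = result.plot_result(plot_from="2020-01-01", figsize=(16, 9), title="CV performance")
    #
    # A list of axes (one per cv split) is returned when result.cv_splits_overlap is True,
    # a single axes object otherwise.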

    def plot_error(self, **plot_params):
        """Plot model absolute error during model selection.

        Parameters
        ----------
        plot_params : kwargs
            plotting parameters passed down to pandas.DataFrame.plot(),
            dependent on your plotting backend,
            e.g. `figsize=(16, 9)`, `title='Performance of Model'`

        Returns
        -------
        pandas.DataFrame.plot()
            plot depending on your plotting backend, by default from matplotlib
        """
        # TODO add different error functions??
        df = self.df_plot
        return df.dropna().groupby("cv_split_str")["error"].plot(legend=True, **plot_params)
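
    # Usage sketch for plot_error() (kwargs are illustrative):
    #
    #     result.plot_error(figsize=(16, 9))
    #
    # This draws one absolute-error line per cv split, labelled by cv_split_str
    # (which already carries the per-split MAE).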

    def __repr__(self):
        return (
            "ModelSelectorResult\n"
            f"-------------------\n"
            f" best_model_name: {self.best_model_name}\n"
            f" frequency: {self.frequency}\n"
            f" horizon: {self.horizon}\n\n"
            f" country_code_column: {self.country_code_column}\n\n"
            f" partition: {dict(self.partition)}\n"
            f" partition_hash: {self.partition_hash}\n\n"
            f" df_plot: DataFrame {self.df_plot.shape} suited for plotting cv results with .plot()\n"
            f" X_train: DataFrame {self.X_train.shape} with training feature values\n"
            f" y_train: Series {self.y_train.shape} with training target values\n"
            f" cv_results: DataFrame {self.cv_results.shape} with gridsearch cv info\n"
            f" best_model_cv_results: Series with gridsearch cv info\n"
            f" cv_data: DataFrame {self.cv_data.shape} "
            f"with models predictions, split and true target values\n"
            f" best_model_cv_data: DataFrame {self.best_model_cv_data.shape} "
            f"with model predictions, split and true target values\n\n"
            f" model_reprs: Dict of model_hash and model_reprs\n"
            f" best_model_hash: {self.best_model_hash}\n"
            f" best_model: {self.best_model}\n"
            "-------------------\n"
        )
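

# End-to-end sketch (illustrative; the path, the loader parameters and X_future are
# assumptions, not confirmed by this module): a persisted result can be reloaded with
# the module-level `load_model_selector_result` helper defined above and its best model
# refitted on the stored training data, since best_model is a sklearn-compatible estimator.
#
#     result = load_model_selector_result(path="results")  # exact parameters come from _load_file
#     model = result.best_model.fit(result.X_train, result.y_train)
#     forecast = model.predict(X_future)  # X_future: future features covering `horizon` steps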