Source code for hcrystalball.model_selection._model_selector

from collections import Counter
from pathlib import Path

import pandas as pd

from ._configuration import add_model_to_gridsearch
from ._configuration import get_gridsearch
from ._large_scale_cross_validation import select_model_general
from ._model_selector_result import load_model_selector_result
from .utils import persist_experts_in_physical_partition


def load_model_selector(folder_path):
    """Load information about stored model selection

    Every ``*.model_selector_result`` file found directly in `folder_path` is loaded
    through `load_model_selector_result` and the loaded results are attached to a new
    `ModelSelector`, whose `horizon`, `frequency` and `country_code_column` are taken
    from the first loaded result.

    Parameters
    ----------
    folder_path : str
        path where .model_selector_result files are stored

    Returns
    -------
    ModelSelector
        Information about model selection for each partition

    Raises
    ------
    IndexError
        If no ``*.model_selector_result`` file is found in `folder_path`
    """
    # `Path.glob` gives no ordering guarantee; sorting keeps the order of `results`
    # (and the result used for the constructor arguments) reproducible across runs.
    result_files = sorted(Path(folder_path).glob("*.model_selector_result"))

    if not result_files:
        # Same exception type the bare `results[0]` lookup used to raise on an empty
        # list, but with a message that tells the caller what is actually missing.
        raise IndexError(
            f"No '*.model_selector_result' files found in '{folder_path}'. "
            "Persist the model selector results there first."
        )

    results = [
        load_model_selector_result(path=result_file.parent, partition_hash=result_file.stem)
        for result_file in result_files
    ]

    first_result = results[0]
    model_selector = ModelSelector(
        horizon=first_result.horizon,
        frequency=first_result.frequency,
        country_code_column=first_result.country_code_column,
    )
    model_selector.results = results

    return model_selector
class ModelSelector:
    """Enable large scale cross validation easily accessible.

    Through `create_gridsearch` and `select_model` methods run cross validation
    and present / persist all relevant information.

    Parameters
    ----------
    horizon : int
        How many steps ahead the predictions during model selection are to be made

    frequency : str
        Temporal frequency of the data which is to be used in model selection
        Data with different frequency will be resampled to this frequency.

    country_code_column : str, list
        Name of the column(s) with ISO code of country/region, which can be used
        for supplying holiday. If used, later provided data must have daily frequency.
        e.g. 'State' with values like 'DE', 'CZ' or 'Region' with values like
        'DE-NW', 'DE-HE', etc.
    """

    def __init__(self, horizon, frequency, country_code_column=None):
        self.horizon = horizon
        self.frequency = frequency
        self.country_code_column = country_code_column
        # Backing fields of the `partitions`, `results` and `stored_path` properties;
        # `None` means "not available yet" and makes the property getters raise.
        self._partitions = None
        self._results = None
        self._stored_path = None

    def select_model(
        self,
        df,
        target_col_name,
        partition_columns=None,
        parallel_over_columns=None,
        executor=None,
        include_rules=None,
        exclude_rules=None,
        output_path="",
        persist_cv_results=False,
        persist_cv_data=False,
        persist_model_reprs=False,
        persist_best_model=False,
        persist_partition=False,
        persist_model_selector_results=False,
    ):
        """Run cross validation on data and selects best model.

        Best models are selected for each timeseries, stored in attribute `self.results`
        and if wanted also persisted. Uses `self.grid_search`, so `create_gridsearch`
        needs to be called beforehand.

        Parameters
        ----------
        df : pandas.DataFrame
            Container holding historical data for training

        target_col_name : str
            Name of target column

        partition_columns : list, tuple
            Column names based on which the data should be split up / partitioned

        parallel_over_columns : list, tuple
            Subset of partition_columns, that are used to parallel split.

        executor : prefect.executors
            Provide prefect's executor. Only valid when `parallel_over_columns` is set.
            For more information see
            https://docs.prefect.io/api/latest/engine/executors.html

        include_rules : dict
            Dictionary with keys being column names and values being list of values
            to include in the output.

        exclude_rules : dict
            Dictionary with keys being column names and values being list of values
            to exclude from the output.

        output_path : str
            Path to directory for storing the output, default behavior is current
            working directory

        persist_cv_results : bool
            If True cv_results of sklearn.model_selection.GridSearchCV as pandas df
            will be saved as pickle for each partition

        persist_cv_data : bool
            If True the pandas df detail cv data will be saved as pickle for each partition

        persist_model_reprs : bool
            If True model reprs will be saved as json for each partition

        persist_best_model : bool
            If True best model will be saved as pickle for each partition

        persist_partition : bool
            If True dictionary of partition label will be saved as json for each partition

        persist_model_selector_results : bool
            If True ModelSelectorResult with all important information will be saved
            as pickle for each partition
        """
        # Snapshot the call arguments first: `locals()` must be read before any other
        # local name is bound, otherwise that name would be forwarded as a keyword too.
        params = {k: v for k, v in locals().items() if k != "self"}

        self.results = select_model_general(
            frequency=self.frequency,
            grid_search=self.grid_search,
            country_code_column=self.country_code_column,
            **params,
        )

    def create_gridsearch(
        self,
        n_splits=5,
        between_split_lag=None,
        scoring="neg_mean_absolute_error",
        country_code=None,
        holidays_days_before=0,
        holidays_days_after=0,
        holidays_bridge_days=False,
        sklearn_models=True,
        sklearn_models_optimize_for_horizon=False,
        autosarimax_models=False,
        autoarima_dict=None,
        prophet_models=False,
        tbats_models=False,
        exp_smooth_models=False,
        theta_models=False,
        average_ensembles=False,
        stacking_ensembles=False,
        stacking_ensembles_train_horizon=10,
        stacking_ensembles_train_n_splits=20,
        clip_predictions_lower=None,
        clip_predictions_upper=None,
        exog_cols=None,
        hcb_verbose=False,
    ):
        """Create grid_search attribute (`sklearn.model_selection.GridSearchCV`) based on selection criteria

        Parameters
        ----------
        n_splits : int
            How many cross-validation folds should be used in model selection

        between_split_lag : int
            How big lag of observations should cv_splits have
            If kept as None, horizon is used resulting in non-overlapping cv_splits

        scoring : str, callable
            String of sklearn regression metric name, or hcrystalball compatible scorer.
            For creation of hcrystalball compatible scorer use `make_ts_scorer` function.

        country_code : str
            Country code in str (e.g. 'DE'). Used in holiday transformer.
            Only one of `country_code_column` or `country_code` can be set.

        holidays_days_before : int
            Number of days before the holiday which will be taken into account
            (i.e. 2 means that new bool column will be created and will be True
            for 2 days before holidays, otherwise False)

        holidays_days_after : int
            Number of days after the holiday which will be taken into account
            (i.e. 2 means that new bool column will be created and will be True
            for 2 days after holidays, otherwise False)

        holidays_bridge_days : bool
            Overlapping `holidays_days_before` and `holidays_days_after` feature
            which serves for modeling between holidays working days

        sklearn_models : bool
            Whether to consider sklearn models

        sklearn_models_optimize_for_horizon : bool
            Whether to add to default sklearn behavior also models, that optimize
            predictions for each horizon

        autosarimax_models : bool
            Whether to consider auto sarimax models

        autoarima_dict : dict
            Specification of pmdautoarima search space

        prophet_models : bool
            Whether to consider FB prophet models

        tbats_models : bool
            Presumably whether to consider TBATS models, by analogy with the other
            `*_models` flags; forwarded unchanged to `get_gridsearch`

        exp_smooth_models : bool
            Whether to consider exponential smoothing models

        theta_models : bool
            Presumably whether to consider theta models, by analogy with the other
            `*_models` flags; forwarded unchanged to `get_gridsearch`

        average_ensembles : bool
            Whether to consider average ensemble models

        stacking_ensembles : bool
            Whether to consider stacking ensemble models

        stacking_ensembles_train_horizon : int
            Which horizon should be used in meta model in stacking ensembles

        stacking_ensembles_train_n_splits : int
            Number of splits used in meta model in stacking ensembles

        clip_predictions_lower : float, int
            Minimal number allowed in the predictions

        clip_predictions_upper : float, int
            Maximal number allowed in the predictions

        exog_cols : list
            List of columns to be used as exogenous variables

        hcb_verbose : bool
            Whether to keep (True) or suppress (False) messages to stdout and stderr
            from the wrapper and 3rd party libraries during fit and predict

        Raises
        ------
        ValueError
            If both `country_code_column` (constructor) and `country_code` are set
        """
        if self.country_code_column is not None and country_code is not None:
            raise ValueError(
                "You can use either `country_code_column` in ModelSelector constructor "
                "or `country_code` here, not both."
            )

        # No local name may be bound above this line: everything in `locals()`
        # except `self` is forwarded as a keyword argument.
        params = {k: v for k, v in locals().items() if k not in ["self"]}

        self.grid_search = get_gridsearch(
            frequency=self.frequency,
            horizon=self.horizon,
            country_code_column=self.country_code_column,
            **params,
        )

    def add_model_to_gridsearch(self, model):
        """Extend `self.grid_search` parameter grid with provided model.

        Adds given model or list of models to the gridsearch under 'model' step

        Parameters
        ----------
        model : sklearn compatible model or list of sklearn compatible models
            model(s) to be added to provided grid search
        """
        self.grid_search = add_model_to_gridsearch(model, self.grid_search)

    def persist_results(
        self,
        folder_path="results",
        persist_cv_results=False,
        persist_cv_data=False,
        persist_model_reprs=False,
        persist_best_model=False,
        persist_partition=False,
        persist_model_selector_results=True,
    ):
        """Store expert files for each partition.

        The file names follow {partition_hash}.{expert_type}
        e.g. 795dab1813f05b1abe9ae6ded93e1ec4.cv_data

        Stores the value returned by `persist_experts_in_physical_partition`
        to `self.stored_path`

        Parameters
        ----------
        folder_path : str
            Path to the directory, where expert files are stored, by default 'results'

        persist_cv_results : bool
            If True `cv_results` of sklearn.model_selection.GridSearchCV as pandas df
            will be saved as pickle for each partition

        persist_cv_data : bool
            If True the pandas df detail cv data will be saved as pickle for each partition

        persist_model_reprs : bool
            If True model reprs will be saved as json for each partition

        persist_best_model : bool
            If True best model will be saved as pickle for each partition

        persist_partition : bool
            If True dictionary of partition label will be saved as json for each partition

        persist_model_selector_results : bool
            If True ModelSelectorResult with all important information will be saved
            as pickle for each partition

        Raises
        ------
        ValueError
            If `select_model` was not called before (no `results` to persist)
        """
        # Must stay the first statement, see `locals()` usage in `select_model`.
        params = {k: v for k, v in locals().items() if k != "self"}

        self.stored_path = persist_experts_in_physical_partition(results=self.results, **params)

    @property
    def results(self):
        """Results for each partition

        Returns
        -------
        list
            List of `ModelSelectorResult` objects

        Raises
        ------
        ValueError
            If `select_model` was not called before
        """
        if self._results is None:
            raise ValueError("You need to run `select_model` first to obtain the `results`!")
        return self._results

    @results.setter
    def results(self, value):
        self._results = value
        # The cached partitions are derived from the results; drop them so that
        # `partitions` is rebuilt from the new results instead of going stale.
        self._partitions = None

    @property
    def partitions(self):
        """List of partitions the model selection was ran on.

        Created only after calling `select_model`.

        Returns
        -------
        list
            List of dictionaries of partitions

        Raises
        ------
        ValueError
            If `select_model` was not called before
        """
        if self._partitions is None:
            if self._results is None:
                raise ValueError("You need to run `select_model` first to obtain the `partitions`!")
            # Computed lazily from the results and cached until `results` is set again.
            self._partitions = self.get_partitions()
        return self._partitions

    @partitions.setter
    def partitions(self, value):
        self._partitions = value

    @property
    def stored_path(self):
        """Path where `ModelSelector` object was stored

        Created only after calling `persist_results`.

        Returns
        -------
        str
            Pathlike string to the folder containing stored ModelSelector object

        Raises
        ------
        ValueError
            If `persist_results` was not called before
        """
        if self._stored_path is None:
            raise ValueError("You need to run `persist_results` first to obtain the `stored_path`!")
        return self._stored_path

    @stored_path.setter
    def stored_path(self, value):
        self._stored_path = value

    def get_result_for_partition(self, partition=None):
        """Provide result for given partition

        Parameters
        ----------
        partition : str, dict
            partition_hash or partition_dict of data to which result is tied to

        Returns
        -------
        ModelSelectorResult
            result of model selection for given partition (the first match)

        Raises
        ------
        ValueError
            if partition is not present in the results
        """
        # A dict is matched against the partition labels, anything else against the hash.
        if isinstance(partition, dict):
            matches = [result for result in self.results if result.partition == partition]
        else:
            matches = [result for result in self.results if result.partition_hash == partition]

        if not matches:
            raise ValueError(
                f"Partition {partition} does not exist. Run 'get_partitions()' to see available options."
            )
        return matches[0]

    def get_partitions(self, as_dataframe=False):
        """Provide overview of partitions for which results are available

        Parameters
        ----------
        as_dataframe : bool
            If True, return the partitions as pandas.DataFrame (one row per partition),
            otherwise return them as list of dicts

        Returns
        -------
        pandas.DataFrame, list[dict, dict, ...]
            partitions available in model selector results

        Raises
        ------
        ValueError
            If `select_model` was not called before
        """
        partitions = [result.partition for result in self.results]
        if as_dataframe:
            return pd.DataFrame(partitions)
        return partitions

    def plot_best_wrapper_classes(self, title="Most often selected classes", **plot_params):
        """Plot number of selected wrapper classes that were picked as best models

        Parameters
        ----------
        title : str
            Title of the plot

        plot_params : dict
            Additional keyword arguments forwarded to `pandas.Series.plot`

        Returns
        -------
        matplotlib.axes._subplots.AxesSubplot
            Plot of most selected wrapper classes
        """
        no_of_best_wrapper_classes = Counter(res.best_model_name for res in self.results)
        return pd.Series(no_of_best_wrapper_classes).plot(kind="barh", title=title, **plot_params)

    def plot_results(self, partitions=None, plot_from=None, **plot_params):
        """Plot training data and cv forecasts for each of the partition

        Parameters
        ----------
        partitions : list
            List of partitions (partition dicts or partition hashes) to plot results for.
            If None or empty, all available partitions are plotted.

        plot_from : str
            Date from which to show the plot
            e.g. '2019-12-31', '2019', or '2019-12'

        plot_params : dict
            Additional keyword arguments forwarded to the `plot_result` of each result

        Returns
        -------
        list
            List of `matplotlib.axes._subplots.AxesSubplot` for each partition

        Raises
        ------
        ValueError
            if a partition is not present in the results
        """
        partitions = partitions or self.partitions

        plts = []
        for partition in partitions:
            partition_result = self.get_result_for_partition(partition)
            # A partition given as hash has no labels of its own, so the title is built
            # from the labels stored on the resolved result in that case.
            labels = partition if isinstance(partition, dict) else partition_result.partition
            plts.append(
                partition_result.plot_result(
                    plot_from=plot_from,
                    title=" ".join(f"{k}={v}" for k, v in labels.items()),
                    **plot_params,
                )
            )
        return plts

    def __repr__(self):
        r = "ModelSelector\n"
        r += "-------------\n"
        r += f" frequency: {self.frequency}\n"
        r += f" horizon: {self.horizon}\n"
        r += f" country_code_column: {self.country_code_column}\n"
        # Results and partitions are listed only once a model selection is available.
        if self._results is not None:
            r += f" results: List of {len(self.results)} ModelSelectorResults\n"
            r += f" partitions: List of {len(self.partitions)} partitions\n"
            for partition in self.partitions:
                r += f" {dict(partition)}\n"
        r += "-------------\n"
        return r