Source code for hcrystalball.feature_extraction._seasonal_transformer

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# TODO: add the possibility to infer the frequency from the data
class SeasonalityTransformer(BaseEstimator, TransformerMixin):
    """Generate seasonal feature columns using one-hot encoding.

    The features are derived from the index of the data passed to
    ``transform`` (converted with ``pandas.to_datetime``) and are appended
    to the input as new columns whose names are prefixed with ``_``.

    Parameters
    ----------
    auto : bool
        Automatically generate week_day, monthly, quarterly, yearly, weekly
        if it makes sense given the data frequency. An explicit ``False`` on
        an individual feature always switches that feature off.

    freq : str
        Frequency of data

    week_day : bool
        Whether to add day name as a feature

    monthly : bool
        Whether to add month as a feature

    quarterly : bool
        Whether to add quarter as a feature

    yearly : bool
        Whether to add year as a feature

    weekly : bool
        Whether to add week number as a feature

    month_start : bool
        Whether to add a flag marking the first day of a month

    month_end : bool
        Whether to add a flag marking the last day of a month

    quarter_start : bool
        Whether to add a flag marking the first day of a quarter

    quarter_end : bool
        Whether to add a flag marking the last day of a quarter

    year_start : bool
        Whether to add a flag marking the first day of a year

    year_end : bool
        Whether to add a flag marking the last day of a year

    Raises
    ------
    ValueError
        Error is raised if freq is not in ['D', 'W', 'M','Q', 'Y', None]

    ValueError
        Error is raised if freq is not provided when using auto=True
    """

    def __init__(
        self,
        auto=True,
        freq=None,
        week_day=None,
        monthly=None,
        quarterly=None,
        yearly=None,
        weekly=None,
        month_start=False,
        month_end=False,
        quarter_start=False,
        quarter_end=False,
        year_start=False,
        year_end=False,
    ):
        self.auto = auto
        self.freq = freq

        if self.freq is not None and self.freq not in ["D", "W", "M", "Q", "Y"]:
            raise ValueError("`freq` needs to be one of 'D', 'W', 'M', 'Q', 'Y', None")

        if self.auto is True and self.freq is None:
            raise ValueError("`freq` needs to be provided if `auto` is set to True")

        self.week_day = week_day
        self.monthly = monthly
        self.quarterly = quarterly
        self.yearly = yearly
        self.weekly = weekly
        self.month_start = month_start
        self.month_end = month_end
        self.quarter_start = quarter_start
        self.quarter_end = quarter_end
        self.year_start = year_start
        self.year_end = year_end
        # Columns produced by the first ``transform`` call after ``fit``;
        # later calls are aligned to this set.
        self._fit_columns = None

    def get_feature_names(self):
        """Provide handle to get column names for created data

        Returns
        -------
        pandas.Index or None
            Names of the generated feature columns, available once
            ``transform`` has been called after ``fit``; ``None`` before that.
        """
        return self._fit_columns

    def fit(self, X, y=None):
        """Set fit columns to None

        The remembered feature columns are reset here, so the next
        ``transform`` call defines them anew.

        Parameters
        ----------
        X : pandas.DataFrame
            Ignored.

        y : numpy.ndarray, optional
            Ignored. Defaults to ``None`` so that ``fit(X)`` and the inherited
            ``fit_transform(X)`` work without a target.

        Returns
        -------
        SeasonalityTransformer
            self
        """
        self._fit_columns = None
        return self

    def _is_enabled(self, flag, auto_freqs):
        """Tell whether a seasonal feature should be generated.

        Parameters
        ----------
        flag : bool or None
            User setting of the feature (e.g. ``self.weekly``).

        auto_freqs : tuple of str
            Data frequencies for which the feature is generated in auto mode.

        Returns
        -------
        bool
            ``False`` if the feature was explicitly disabled, ``True`` if it
            was explicitly enabled, otherwise whether auto mode covers
            ``self.freq``.
        """
        if flag is False:
            return False
        return bool(flag or (self.auto and self.freq in auto_freqs))

    def _ensure_pred_and_train_cols_equals(self, X):
        """Ensure match between fit and transform columns

        Returns Pandas dataframe for inference with the same features as
        during training (i.e. Test data could miss some months...).
        This method is important as most regressors expect the same
        structure of data for training as for inference

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Data with the same features as train set had
        """
        miss_cols = list(self._fit_columns.difference(X.columns))
        if miss_cols:
            # Categories unseen in this data get an all-zero dummy column.
            miss_data = pd.DataFrame(
                data=np.zeros((len(X.index), len(miss_cols)), dtype=int),
                columns=miss_cols,
                index=X.index,
            )
            X = X.join(miss_data)
        # Selecting by the fit columns also restores their order and drops
        # columns that were not present at fit time.
        return X[self._fit_columns]

    def transform(self, X):
        """Create seasonal columns from datetime index

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Contains the generated feature vector(s)
        """
        date = pd.to_datetime(X.index)

        season_feat = []
        if self._is_enabled(self.week_day, ("D",)):
            season_feat.append(pd.get_dummies(date.day_name()))

        if self._is_enabled(self.weekly, ("D", "W")):
            # isocalendar() is used for the week number; DatetimeIndex.week is
            # deprecated in pandas.
            season_feat.append(
                pd.get_dummies(date.isocalendar().week.values).rename(columns=lambda x: f"{x}_week")
            )

        if self._is_enabled(self.monthly, ("D", "W", "M")):
            season_feat.append(pd.get_dummies(date.month_name()))

        if self._is_enabled(self.quarterly, ("D", "W", "M", "Q")):
            season_feat.append(pd.get_dummies(date.quarter).rename(columns=lambda x: f"{x}_quarter"))

        if self._is_enabled(self.yearly, ("D", "W", "M", "Q", "Y")):
            season_feat.append(pd.get_dummies(date.year))

        if season_feat:
            _X = pd.concat(season_feat, axis=1)
        else:
            # pd.concat raises on an empty list; start from an empty frame so
            # that the boolean flags below can still be used on their own.
            _X = pd.DataFrame(index=pd.RangeIndex(len(date)))

        if self.month_start:
            _X["month_start"] = date.is_month_start

        if self.month_end:
            _X["month_end"] = date.is_month_end

        if self.quarter_start:
            _X["quarter_start"] = date.is_quarter_start

        if self.quarter_end:
            _X["quarter_end"] = date.is_quarter_end

        if self.year_start:
            _X["year_start"] = date.is_year_start

        if self.year_end:
            _X["year_end"] = date.is_year_end

        # Prefix the generated columns (this also turns integer labels such as
        # years into strings).
        _X.columns = [f"_{col}" for col in _X.columns]

        if self._fit_columns is not None:
            _X = self._ensure_pred_and_train_cols_equals(_X)
        else:
            # First transform after fit: remember the produced columns.
            self._fit_columns = _X.columns

        _X.index = date

        return pd.merge(X, _X, left_index=True, right_index=True, how="left")