"""Source code for hcrystalball.compose._ts_column_transformer."""

from collections import namedtuple

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted


class TSColumnTransformer(ColumnTransformer):
    """Time Series compatible ColumnTransformer.

    Allow usage of hcrystalball wrappers and index based transformers.

    See also: `sklearn.compose.ColumnTransformer`

    Returns
    -------
    pandas.DataFrame
        Data transformed on given column

    Raises
    ------
    ValueError
        If `remainder=='passthrough'` is set. Use `passthrough` as an identity estimator
        If sparse output is requested, but not all columns are numeric
    """

    @property
    def remainder(self):
        """Access to original remainder"""
        return self._remainder_original

    @remainder.setter
    def remainder(self, value):
        # Reject sklearn's built-in remainder='passthrough'; users are directed
        # (see class docstring) to add an explicit 'passthrough' identity
        # estimator entry instead.
        if value == "passthrough":
            # BUGFIX: the two concatenated string literals previously rendered
            # as "...is not supported.Please use..." (missing space).
            raise ValueError(
                "TSColumnTransformer.remainder=='passthrough' is not supported. "
                "Please use 'passthrough' as an identity estimator"
            )
        self._remainder_original = value
    def get_feature_names(self):
        """Get feature names from all transformers.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        check_is_fitted(self, "transformers_")
        # gather column names generated by transformers to defined structure
        # and solve name duplicities in more sophisticated way
        Columns = namedtuple("Columns", "col_name trans_name trans_index passthrough")
        feature_tuples = []
        for index, (name, trans, apply_cols, _) in enumerate(self._iter(fitted=True)):
            if trans == "passthrough":
                # identity transformer: output columns are the input columns
                col_tuple = Columns(
                    col_name=apply_cols,
                    trans_name=name,
                    trans_index=index,
                    passthrough=True,
                )
            elif trans == "drop":
                # dropped columns produce no output names
                continue
            elif hasattr(trans, "get_feature_names"):
                # transformer knows its own output names (e.g. encoders)
                col_tuple = Columns(
                    col_name=trans.get_feature_names(),
                    trans_name=name,
                    trans_index=index,
                    passthrough=False,
                )
            else:
                # TODO: for transformers that reduce/inflate dimensions,
                # this might cause unwanted behavior
                # Temporary fix for PCA
                if hasattr(trans, "n_components"):
                    if trans.n_components != len(apply_cols):
                        # dimensionality changed: synthesize "<name>_<i>" column names
                        apply_cols = [name + "_" + str(i) for i in range(trans.n_components)]
                col_tuple = Columns(
                    col_name=apply_cols,
                    trans_name=name,
                    trans_index=index,
                    passthrough=False,
                )
            feature_tuples.append(col_tuple)
        # make sure passthrough column names have precedence over other transformers
        # when duplicate column names occur;
        # explode turns list-valued col_name into one row per output column,
        # sorting passthrough rows first means duplicated() keeps them unsuffixed
        df = (
            pd.DataFrame(feature_tuples)
            .explode("col_name")
            .reset_index(drop=True)
            .sort_values("passthrough", ascending=False)
        )
        duplicates = df.duplicated(subset=["col_name"])
        # disambiguate later duplicates by suffixing their transformer name
        df.loc[duplicates, "col_name"] += "_" + df.loc[duplicates, "trans_name"]
        # sort_index restores the original transformer/column order
        feature_names = df.sort_index()["col_name"].tolist()
        return feature_names
def _hstack(self, Xs): """Stack Xs horizontally. This allows subclasses to control the stacking behavior, while reusing everything else from ColumnTransformer and returning pandas.DataFrame version of data at the end. Parameters ---------- Xs : List List of numpy arrays, sparse arrays, or DataFrames Returns ------- pandas.DataFrame Stacked data with correct column names Raises ------ ValueError Raises ValueError when columns are not numeric for sparse output """ if self.sparse_output_: try: # since all columns should be numeric before stacking them # in a sparse matrix, `check_array` is used for the # dtype conversion if necessary. converted_Xs = [check_array(X, accept_sparse=True, force_all_finite=False) for X in Xs] except ValueError: raise ValueError( "For a sparse output, all columns should" " be a numeric or convertible to a numeric." ) return pd.DataFrame(sparse.hstack(converted_Xs).tocsr(), columns=self.get_feature_names()) else: Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] # addition, that turns nparray to dataframe with correct column names return pd.DataFrame(np.hstack(Xs), columns=self.get_feature_names())
[docs] def transform(self, X): """Run index aware transform Parameters ---------- X : pandas.DataFrame Input features. Returns ------- pandas.DataFrame Transformed data by given transformer on given column """ df = super().transform(X) df.index = X.index return df
[docs] def fit_transform(self, X, y=None): """Run index aware fit_transform Parameters ---------- X : pandas.DataFrame Input features. y : pandas.Series or numpy.array Target values Returns ------- pandas.DataFrame Transformed data by given transformer on given column """ df = super().fit_transform(X, y) df.index = X.index return df