Source code for hcrystalball.compose._ts_column_transformer
from collections import namedtuple
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted
class TSColumnTransformer(ColumnTransformer):
    """Time Series compatible ColumnTransformer.

    Allow usage of hcrystalball wrappers and index based transformers.
    See also: `sklearn.compose.ColumnTransformer`

    Returns
    -------
    pandas.DataFrame
        Data transformed on given column

    Raises
    ------
    ValueError
        If `remainder=='passthrough'` is set. Use `passthrough` as an identity estimator
        If sparse output is requested, but not all columns are numeric
    """

    @property
    def remainder(self):
        """Access to original remainder"""
        return self._remainder_original

    @remainder.setter
    def remainder(self, value):
        """Store the remainder, rejecting the unsupported ``'passthrough'`` value.

        Raises
        ------
        ValueError
            If ``value == 'passthrough'``.
        """
        if value == "passthrough":
            raise ValueError(
                "TSColumnTransformer.remainder=='passthrough' is not supported. "
                "Please use 'passthrough' as an identity estimator"
            )
        self._remainder_original = value

    def get_feature_names(self):
        """Get feature names from all transformers.

        Column names coming from ``'passthrough'`` transformers keep their
        name; any other column whose name collides with an already used one
        gets the ``_<transformer name>`` suffix.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        check_is_fitted(self, "transformers_")

        # gather column names generated by transformers to defined structure
        # and solve name duplicities in more sophisticated way
        Columns = namedtuple("Columns", "col_name trans_name trans_index passthrough")
        feature_tuples = []
        for index, (name, trans, apply_cols, _) in enumerate(self._iter(fitted=True)):
            if trans == "passthrough":
                col_tuple = Columns(
                    col_name=apply_cols,
                    trans_name=name,
                    trans_index=index,
                    passthrough=True,
                )
            elif trans == "drop":
                continue
            elif hasattr(trans, "get_feature_names"):
                col_tuple = Columns(
                    col_name=trans.get_feature_names(),
                    trans_name=name,
                    trans_index=index,
                    passthrough=False,
                )
            else:
                # TODO: for transformers that reduce/inflate dimensions,
                # this might cause unwanted behavior
                # Temporary fix for PCA
                if hasattr(trans, "n_components"):
                    # Prefer the fitted attribute when the transformer exposes one:
                    # the constructor parameter may be None or a float, which
                    # cannot be passed to `range`.
                    n_out = getattr(trans, "n_components_", trans.n_components)
                    if isinstance(n_out, (int, np.integer)) and n_out != len(apply_cols):
                        apply_cols = [name + "_" + str(i) for i in range(n_out)]
                col_tuple = Columns(
                    col_name=apply_cols,
                    trans_name=name,
                    trans_index=index,
                    passthrough=False,
                )
            feature_tuples.append(col_tuple)

        # Nothing left after dropping: an empty frame has no "col_name"
        # column, so `explode` below would raise KeyError.
        if not feature_tuples:
            return []

        # make sure passthrough column names have precedence over other transformers
        # when duplicate column names occur
        df = (
            pd.DataFrame(feature_tuples)
            .explode("col_name")
            .reset_index(drop=True)
            # stable sort: among equal `passthrough` values the original
            # transformer order decides which duplicate keeps its bare name
            .sort_values("passthrough", ascending=False, kind="mergesort")
        )
        duplicates = df.duplicated(subset=["col_name"])
        df.loc[duplicates, "col_name"] += "_" + df.loc[duplicates, "trans_name"]
        # restore the positional (transformer output) order before returning
        feature_names = df.sort_index()["col_name"].tolist()
        return feature_names

    def _hstack(self, Xs):
        """Stack Xs horizontally.

        This allows subclasses to control the stacking behavior, while reusing
        everything else from ColumnTransformer and returning pandas.DataFrame
        version of data at the end.

        Parameters
        ----------
        Xs : List
            List of numpy arrays, sparse arrays, or DataFrames

        Returns
        -------
        pandas.DataFrame
            Stacked data with correct column names

        Raises
        ------
        ValueError
            Raises ValueError when columns are not numeric for sparse output
        """
        if self.sparse_output_:
            try:
                # since all columns should be numeric before stacking them
                # in a sparse matrix, `check_array` is used for the
                # dtype conversion if necessary.
                converted_Xs = [check_array(X, accept_sparse=True, force_all_finite=False) for X in Xs]
            except ValueError as err:
                raise ValueError(
                    "For a sparse output, all columns should be a numeric or convertible to a numeric."
                ) from err
            stacked = sparse.hstack(converted_Xs).tocsr()
            # The plain DataFrame constructor does not interpret a scipy sparse
            # matrix as a 2-D table; `from_spmatrix` is the documented way.
            return pd.DataFrame.sparse.from_spmatrix(stacked, columns=self.get_feature_names())

        dense_Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
        # addition, that turns nparray to dataframe with correct column names
        return pd.DataFrame(np.hstack(dense_Xs), columns=self.get_feature_names())

    def transform(self, X):
        """Run index aware transform

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Transformed data by given transformer on given column
        """
        df = super().transform(X)
        # the stacked frame gets a fresh RangeIndex; carry over the input index
        df.index = X.index
        return df

    def fit_transform(self, X, y=None):
        """Run index aware fit_transform

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.
        y : pandas.Series or numpy.array
            Target values

        Returns
        -------
        pandas.DataFrame
            Transformed data by given transformer on given column
        """
        df = super().fit_transform(X, y)
        # the stacked frame gets a fresh RangeIndex; carry over the input index
        df.index = X.index
        return df