# Source code for hcrystalball.compose._ts_column_transformer

from collections import namedtuple

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted


class TSColumnTransformer(ColumnTransformer):
    """Time Series compatible ColumnTransformer.

    Allow usage of hcrystalball wrappers and index based transformers.
    The output of ``transform`` / ``fit_transform`` is a ``pandas.DataFrame``
    that carries the index of the input and column names derived from the
    individual transformers.
    See also: `sklearn.compose.ColumnTransformer`

    Returns
    -------
    pandas.DataFrame
        Data transformed on given column

    Raises
    ------
    ValueError
        If `remainder=='passthrough'` is set. Use `passthrough` as an identity estimator
        If sparse output is requested, but not all columns are numeric
    """

    @property
    def remainder(self):
        """Access to original remainder"""
        return self._remainder_original

    @remainder.setter
    def remainder(self, value):
        """Store the remainder, rejecting the unsupported ``'passthrough'`` value.

        Raises
        ------
        ValueError
            If ``value == 'passthrough'``.
        """
        if value == "passthrough":
            raise ValueError(
                "TSColumnTransformer.remainder=='passthrough' is not supported. "
                "Please use 'passthrough' as an identity estimator"
            )
        self._remainder_original = value

    def get_feature_names(self):
        """Get feature names from all transformers.

        Names come from, in order of preference: the selected columns for
        ``'passthrough'`` transformers, the transformer's own
        ``get_feature_names()`` when it has one, and otherwise the selected
        columns (or ``<transformer name>_<i>`` when the transformer reports a
        different number of components than it received columns).
        Duplicate names are disambiguated by appending ``_<transformer name>``;
        passthrough columns keep their original name.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        check_is_fitted(self, "transformers_")
        # gather column names generated by transformers to defined structure
        # and solve name duplicities in more sophisticated way
        Columns = namedtuple("Columns", "col_name trans_name trans_index passthrough")
        feature_tuples = []
        for index, (name, trans, apply_cols, _) in enumerate(self._iter(fitted=True)):
            if trans == "passthrough":
                col_names = apply_cols
                is_passthrough = True
            elif trans == "drop":
                continue
            elif hasattr(trans, "get_feature_names"):
                col_names = trans.get_feature_names()
                is_passthrough = False
            else:
                # TODO: for transformers that reduce/inflate dimensions,
                #      this might cause unwanted behavior
                # Temporary fix for PCA
                col_names = apply_cols
                is_passthrough = False
                if hasattr(trans, "n_components"):
                    # The constructor argument may be None, a float or a string
                    # (all valid for PCA), none of which can be fed to range().
                    # Prefer the fitted integer count when the estimator exposes it.
                    n_components = getattr(trans, "n_components_", trans.n_components)
                    if isinstance(n_components, (int, np.integer)) and n_components != len(apply_cols):
                        col_names = [name + "_" + str(i) for i in range(n_components)]

            feature_tuples.append(
                Columns(
                    col_name=col_names,
                    trans_name=name,
                    trans_index=index,
                    passthrough=is_passthrough,
                )
            )

        # Nothing produced any column (e.g. every transformer is 'drop'):
        # an empty frame has no "col_name" column to explode.
        if not feature_tuples:
            return []

        # make sure passthrough column names have precedence over other transformers
        # when duplicate column names occur
        df = (
            pd.DataFrame(feature_tuples)
            .explode("col_name")
            .reset_index(drop=True)
            .sort_values("passthrough", ascending=False)
        )
        duplicates = df.duplicated(subset=["col_name"])
        df.loc[duplicates, "col_name"] += "_" + df.loc[duplicates, "trans_name"]
        # sort_index restores the original (transformer, column) order
        feature_names = df.sort_index()["col_name"].tolist()

        return feature_names

    def _hstack(self, Xs):
        """Stack Xs horizontally.

        This allows subclasses to control the stacking behavior, while reusing
        everything else from ColumnTransformer and returning pandas.DataFrame
        version of data at the end.

        Parameters
        ----------
        Xs : List
            List of numpy arrays, sparse arrays, or DataFrames

        Returns
        -------
        pandas.DataFrame
            Stacked data with correct column names. When sparse output is
            requested the frame is built from the stacked sparse matrix and
            its columns are sparse-backed.

        Raises
        ------
        ValueError
            Raises ValueError when columns are not numeric for sparse output
        """
        if self.sparse_output_:
            try:
                # since all columns should be numeric before stacking them
                # in a sparse matrix, `check_array` is used for the
                # dtype conversion if necessary.
                converted_Xs = [check_array(X, accept_sparse=True, force_all_finite=False) for X in Xs]
            except ValueError as err:
                raise ValueError(
                    "For a sparse output, all columns should be a numeric or convertible to a numeric."
                ) from err

            stacked = sparse.hstack(converted_Xs).tocsr()
            # The plain DataFrame constructor does not interpret a scipy sparse
            # matrix as a 2-D table; from_spmatrix is the supported entry point.
            return pd.DataFrame.sparse.from_spmatrix(stacked, columns=self.get_feature_names())

        dense_Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
        # addition, that turns nparray to dataframe with correct column names
        return pd.DataFrame(np.hstack(dense_Xs), columns=self.get_feature_names())

    def transform(self, X):
        """Run index aware transform

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.

        Returns
        -------
        pandas.DataFrame
            Transformed data by given transformer on given column,
            indexed like ``X``
        """
        df = super().transform(X)
        # the stacked frame is built from raw arrays, so re-attach the input index
        df.index = X.index
        return df

    def fit_transform(self, X, y=None):
        """Run index aware fit_transform

        Parameters
        ----------
        X : pandas.DataFrame
            Input features.
        y : pandas.Series or numpy.array
            Target values

        Returns
        -------
        pandas.DataFrame
            Transformed data by given transformer on given column,
            indexed like ``X``
        """
        df = super().fit_transform(X, y)
        # the stacked frame is built from raw arrays, so re-attach the input index
        df.index = X.index
        return df