Skip to content

Add input_df init argument to pass df/series to transformers #85

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
Load some Data
**************

Normally you'll read the data from a file, but for demonstration purposes I'll create a data frame from a Python dict::
Normally you'll read the data from a file, but for demonstration purposes we'll create a data frame from a Python dict::

>>> data = pd.DataFrame({'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
... 'children': [4., 6, 3, 3, 2, 3, 5, 4],
Expand Down Expand Up @@ -116,6 +116,37 @@ the dataframe mapper. We can do so by inspecting the automatically generated
['pet_cat', 'pet_dog', 'pet_fish', 'children']


Passing Series/DataFrames to the transformers
*********************************************

By default the transformers are passed a numpy array of the selected columns
as input. This is because ``sklearn`` transformers are historically designed to
work with numpy arrays, not with pandas dataframes, even though their basic
indexing interfaces are similar.

However we can pass a dataframe/series to the transformers to handle custom
cases by initializing the dataframe mapper with ``input_df=True``::

>>> from sklearn.base import TransformerMixin
>>> class DateEncoder(TransformerMixin):
... def fit(self, X, y=None):
... return self
...
... def transform(self, X):
... dt = X.dt
... return pd.concat([dt.year, dt.month, dt.day], axis=1)
>>> dates_df = pd.DataFrame(
... {'dates': pd.date_range('2015-10-30', '2015-11-02')})
>>> mapper_dates = DataFrameMapper([
... ('dates', DateEncoder())
... ], input_df=True)
>>> mapper_dates.fit_transform(dates_df)
array([[2015, 10, 30],
[2015, 10, 31],
[2015, 11, 1],
[2015, 11, 2]])


Outputting a dataframe
**********************

Expand Down Expand Up @@ -289,6 +320,8 @@ Development
* Capture output columns generated names in ``transformed_names_`` attribute (#78).
* Add ``CategoricalImputer`` that replaces null-like values with the mode
for string-like columns.
* Add ``input_df`` init argument to allow inputting a dataframe/series to the
transformers instead of a numpy array (#60).


1.3.0 (2017-01-21)
Expand Down
19 changes: 16 additions & 3 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
sklearn transformation.
"""

def __init__(self, features, default=False, sparse=False, df_out=False):
def __init__(self, features, default=False, sparse=False, df_out=False,
input_df=False):
"""
Params:

Expand All @@ -57,6 +58,10 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
if there's multiple inputs, and the name concatenated with
'_1', '_2' etc if there's multiple outputs. NB: does not
work if *default* or *sparse* are true

input_df If ``True`` pass the selected columns to the transformers
as a pandas DataFrame or Series. Otherwise pass them as a
numpy array. Defaults to ``False``.
"""
if isinstance(features, list):
features = [(columns, _build_transformer(transformers))
Expand All @@ -65,6 +70,7 @@ def __init__(self, features, default=False, sparse=False, df_out=False):
self.default = _build_transformer(default)
self.sparse = sparse
self.df_out = df_out
self.input_df = input_df
self.transformed_names_ = []

if (df_out and (sparse or default)):
Expand Down Expand Up @@ -108,6 +114,8 @@ def __setstate__(self, state):
self.default = state.get('default', False)
self.df_out = state.get('df_out', False)

self.input_df = state.get('input_df', False)

def _get_col_subset(self, X, cols):
"""
Get a subset of columns from the given table X.
Expand All @@ -132,10 +140,15 @@ def _get_col_subset(self, X, cols):
X = X.df

if return_vector:
t = X[cols[0]].values
t = X[cols[0]]
else:
t = X[cols].values
t = X[cols]

# return either a DataFrame/Series or a numpy array
if self.input_df:
return t
else:
return t.values
return t

def fit(self, X, y=None):
Expand Down
90 changes: 90 additions & 0 deletions tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,15 @@ def predict(self, X):
return True


class DateEncoder():
    """Toy transformer that expands a datetime Series into a
    (year, month, day) DataFrame; used to exercise ``input_df=True``."""

    def fit(self, X, y=None):
        # Stateless: there is nothing to learn from the data.
        return self

    def transform(self, X):
        accessor = X.dt
        parts = [accessor.year, accessor.month, accessor.day]
        return pd.concat(parts, axis=1)


class ToSparseTransformer(BaseEstimator, TransformerMixin):
"""
Transforms numpy matrix to sparse format.
Expand Down Expand Up @@ -225,6 +234,87 @@ def test_pca(complex_dataframe):
assert cols[1] == 'feat1_feat2_1'


def test_input_df_true_first_transformer(simple_dataframe, monkeypatch):
    """
    If input_df is True, the first transformer is passed
    a pd.Series instead of an np.array
    """
    # Bind the mocks to names so assertions inspect the exact objects
    # that were patched onto the class.
    fake_fit = Mock()
    fake_transform = Mock(return_value=np.array([1, 2, 3]))
    monkeypatch.setattr(MockXTransformer, 'fit', fake_fit)
    monkeypatch.setattr(MockXTransformer, 'transform', fake_transform)

    mapper = DataFrameMapper([('a', MockXTransformer())], input_df=True)
    result = mapper.fit_transform(simple_dataframe)

    fit_args, _ = fake_fit.call_args
    assert isinstance(fit_args[0], pd.Series)

    transform_args, _ = fake_transform.call_args
    assert isinstance(transform_args[0], pd.Series)

    assert_array_equal(result, np.array([1, 2, 3]).reshape(-1, 1))


def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    If input_df is True, the subsequent transformers get passed pandas
    objects instead of numpy arrays (given the previous transformers
    output pandas objects as well)
    """
    fake_fit = Mock()
    fake_transform = Mock(return_value=pd.Series([1, 2, 3]))
    monkeypatch.setattr(MockTClassifier, 'fit', fake_fit)
    monkeypatch.setattr(MockTClassifier, 'transform', fake_transform)

    # MockXTransformer runs first; MockTClassifier must then receive
    # whatever pandas object the first step produced.
    mapper = DataFrameMapper(
        [('a', [MockXTransformer(), MockTClassifier()])],
        input_df=True)
    result = mapper.fit_transform(simple_dataframe)

    fit_args, _ = fake_fit.call_args
    assert isinstance(fit_args[0], pd.Series)

    assert_array_equal(result, np.array([1, 2, 3]).reshape(-1, 1))


def test_input_df_true_multiple_cols(complex_dataframe):
    """
    When input_df is True, applying transformers to multiple columns
    works as expected
    """
    columns = ['target', 'feat1']
    mapper = DataFrameMapper(
        [(col, MockXTransformer()) for col in columns],
        input_df=True)
    result = mapper.fit_transform(complex_dataframe)

    # Each output column must match its source column unchanged.
    for position, col in enumerate(columns):
        assert_array_equal(result[:, position], complex_dataframe[col].values)


def test_input_df_date_encoder():
    """
    When input_df is True we can apply a transformer that only works
    with pandas dataframes like a DateEncoder
    """
    dates = pd.date_range('2015-10-30', '2015-11-02')
    df = pd.DataFrame({'dates': dates})
    mapper = DataFrameMapper([('dates', DateEncoder())], input_df=True)

    result = mapper.fit_transform(df)

    # Expected output: one (year, month, day) row per input date.
    expected = np.array([[d.year, d.month, d.day] for d in dates])
    assert_array_equal(result, expected)


def test_nonexistent_columns_explicit_fail(simple_dataframe):
"""
If a nonexistent column is selected, KeyError is raised.
Expand Down