Skip to content

added y argument to fit methods #59

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 3, 2016
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,19 +167,36 @@ passing it as the ``default`` argument to the mapper:
... ('pet', sklearn.preprocessing.LabelBinarizer()),
... ('children', None)
... ], default=sklearn.preprocessing.StandardScaler())
>>> np.round(mapper4.fit_transform(data.copy()))
array([[ 1., 0., 0., 4., 2.],
[ 0., 1., 0., 6., -1.],
[ 0., 1., 0., 3., 0.],
[ 0., 0., 1., 3., -1.],
[ 1., 0., 0., 2., -0.],
[ 0., 1., 0., 3., 1.],
[ 1., 0., 0., 5., -0.],
[ 0., 0., 1., 4., -1.]])
>>> np.round(mapper4.fit_transform(data.copy()), 1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does one need to pass "1" as second argument to this transform, and why is the output different from the previous case in the last column?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "1" is the second argument of np.round. This test case was failing even though I don't think I modified anything that affected it. The issue seems to be that on my machine np.round(-0.3) equals "0.", not "-0." Changing it to round to 1 decimal place fixed the test case.

array([[ 1. , 0. , 0. , 4. , 2.3],
[ 0. , 1. , 0. , 6. , -0.9],
[ 0. , 1. , 0. , 3. , 0.1],
[ 0. , 0. , 1. , 3. , -0.7],
[ 1. , 0. , 0. , 2. , -0.5],
[ 0. , 1. , 0. , 3. , 0.8],
[ 1. , 0. , 0. , 5. , -0.3],
[ 0. , 0. , 1. , 4. , -0.7]])

Using ``default=False`` (the default) drops unselected columns. Using
``default=None`` passes the unselected columns through unchanged.

Feature selection and other supervised transformations
******************************************************

``DataFrameMapper`` supports transformers that require both X and y arguments. An example of this is feature selection. Treating the 'pet' column as the target, we will select the column that best predicts it.

>>> from sklearn.feature_selection import SelectKBest, chi2
>>> mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
>>> mapper_fs.fit_transform(data[['children','salary']], data['pet'])
array([[ 90.],
[ 24.],
[ 44.],
[ 27.],
[ 32.],
[ 59.],
[ 36.],
[ 27.]])

Working with sparse features
****************************

Expand Down
13 changes: 8 additions & 5 deletions sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sklearn.base import BaseEstimator, TransformerMixin

from .cross_validation import DataWrapper
from .pipeline import make_transformer_pipeline
from .pipeline import make_transformer_pipeline, _call_fit

# load in the correct stringtype: str for py3, basestring for py2
string_types = str if sys.version_info >= (3, 0) else basestring
def fit(self, X, y=None):
    """
    Fit a transformation from the pipeline.

    X the data to fit

    y the target vector relative to X, optional (forwarded to each
      transformer's fit via _call_fit, which tolerates fit methods
      that do not accept a y argument)
    """
    for columns, transformers in self.features:
        if transformers is not None:
            # _call_fit tries fit(X, y) first and falls back to fit(X).
            _call_fit(transformers.fit,
                      self._get_col_subset(X, columns), y)

    # handle features not explicitly selected
    if self.default:  # not False and not None
        _call_fit(self.default.fit,
                  self._get_col_subset(X, self._unselected_columns(X)), y)
    return self

def transform(self, X):
Expand Down
54 changes: 41 additions & 13 deletions sklearn_pandas/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,37 @@
from sklearn.utils import tosequence


def _call_fit(fit_method, X, y=None, **kwargs):
"""
helper function, calls the fit or fit_transform method with the correct
number of parameters

fit_method: fit or fit_transform method of the transformer
X: the data to fit
y: the target vector relative to X, optional
kwargs: any keyword arguments to the fit method

return: the result of the fit or fit_transform method

WARNING: if this function raises a TypeError exception, test the fit
or fit_transform method passed to it in isolation as _call_fit will not
distinguish TypeError due to incorrect number of arguments from
other TypeError
"""
try:
return fit_method(X, y, **kwargs)
except TypeError:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about:


except TypeError as e:
    if e.args and 'takes exactly' in e.args[0]:
        # fit takes only one argument
        return fit_method(X, **kwargs)
    # TypeError caused by some other unkown reason
    raise

I know it looks a bit hacky but I guess it will solve your warning above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pretty clever, I hadn't thought of doing something like this before. Unfortunately when I tested it, the error message accompanying a TypeError varies between Python 2 and 3. On 2, it's "test_func() takes exactly 2 arguments (1 given)" but on 3 it's "test_func() missing 1 required positional argument:". It's probably safer to leave as is.

# fit takes only one argument
return fit_method(X, **kwargs)


class TransformerPipeline(Pipeline):
"""
Pipeline that expects all steps to be transformers taking a single X argument,
an optional y argument,
and having fit and transform methods.

Code is copied from sklearn's Pipeline
"""
def __init__(self, steps):
names, estimators = zip(*steps)
Expand All @@ -31,31 +56,34 @@ def __init__(self, steps):
"'%s' (type %s) doesn't)"
% (estimator, type(estimator)))

def _pre_transform(self, X, y=None, **fit_params):
    """
    Fit (and transform) every step except the last, threading the
    optional target y through each fit via _call_fit.

    Returns the transformed X and the keyword params destined for the
    final step's fit.
    """
    fit_params_steps = dict((step, {}) for step, _ in self.steps)
    # Route 'step__param' keyword arguments to their owning step.
    for pname, pval in six.iteritems(fit_params):
        step, param = pname.split('__', 1)
        fit_params_steps[step][param] = pval
    Xt = X
    for name, transform in self.steps[:-1]:
        if hasattr(transform, "fit_transform"):
            Xt = _call_fit(transform.fit_transform,
                           Xt, y, **fit_params_steps[name])
        else:
            Xt = _call_fit(transform.fit,
                           Xt, y, **fit_params_steps[name]).transform(Xt)
    return Xt, fit_params_steps[self.steps[-1][0]]

def fit(self, X, y=None, **fit_params):
    """
    Fit all pipeline steps on X, passing the optional target y to each
    step's fit (steps whose fit takes no y are handled by _call_fit).
    """
    Xt, fit_params = self._pre_transform(X, y, **fit_params)
    _call_fit(self.steps[-1][-1].fit, Xt, y, **fit_params)
    return self

def fit_transform(self, X, y=None, **fit_params):
    """
    Fit all steps and transform X with the final step, preferring the
    final step's fit_transform when it exists; y is forwarded to each
    fit via _call_fit.
    """
    Xt, fit_params = self._pre_transform(X, y, **fit_params)
    if hasattr(self.steps[-1][-1], 'fit_transform'):
        return _call_fit(self.steps[-1][-1].fit_transform,
                         Xt, y, **fit_params)
    else:
        return _call_fit(self.steps[-1][-1].fit,
                         Xt, y, **fit_params).transform(Xt)


def make_transformer_pipeline(*steps):
Expand Down
39 changes: 39 additions & 0 deletions tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from numpy.testing import assert_array_equal
Expand Down Expand Up @@ -69,6 +70,13 @@ def simple_dataframe():
return pd.DataFrame({'a': [1, 2, 3]})


@pytest.fixture
def complex_dataframe():
    """Six-row frame: a string target plus two numeric feature columns."""
    data = {'target': ['a', 'a', 'a', 'b', 'b', 'b'],
            'feat1': [1, 2, 3, 4, 5, 6],
            'feat2': [1, 2, 3, 2, 3, 4]}
    return pd.DataFrame(data)


def test_nonexistent_columns_explicit_fail(simple_dataframe):
"""
If a nonexistent column is selected, KeyError is raised.
Expand Down Expand Up @@ -306,6 +314,37 @@ def test_sparse_off(simple_dataframe):
assert type(dmatrix) != sparse.csr.csr_matrix


def test_fit_with_optional_y_arg(complex_dataframe):
    """
    Transformers whose fit method takes an optional y argument are
    handled correctly when a target is supplied.
    """
    frame = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    # Should complete without raising.
    mapper.fit(frame[['feat1', 'feat2']], frame['target'])


def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly.
    """
    frame = complex_dataframe
    X = frame[['feat1', 'feat2']]
    y = frame['target']
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit alone should not raise
    ft_arr = mapper.fit(X, y)

    # fit_transform keeps only the single best feature
    ft_arr = mapper.fit_transform(X, y)
    assert_array_equal(ft_arr, frame[['feat1']].values)

    # transform after fitting yields the same selection
    t_arr = mapper.transform(X)
    assert_array_equal(t_arr, frame[['feat1']].values)


# Integration tests with real dataframes

@pytest.fixture
Expand Down
76 changes: 75 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
import pytest
from sklearn_pandas.pipeline import TransformerPipeline
from sklearn_pandas.pipeline import TransformerPipeline, _call_fit
from functools import partial
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imported but unused; please remove this import.


# In py3, mock is included with the unittest standard library
# In py2, it's a separate package
try:
from unittest.mock import patch
except ImportError:
from mock import patch


class NoTransformT(object):
Expand All @@ -16,6 +24,39 @@ def transform(self, x):
return self


class Trans(object):
    """
    Minimal transformer stub: fit (with optional y) and transform
    both do nothing and return self.
    """

    def fit(self, x, y=None):
        # Ignores its arguments; exists only to satisfy the interface.
        return self

    def transform(self, x):
        return self


def func_x_y(x, y, kwarg='kwarg'):
    """Stand-in fit function that requires both an x and a y argument."""
    return None


def func_x(x, kwarg='kwarg'):
    """Stand-in fit function that requires only an x argument."""
    return None


def func_raise_type_err(x, y, kwarg='kwarg'):
    """
    Stand-in fit function requiring x and y that always raises
    TypeError, regardless of its arguments.
    """
    raise TypeError


def test_all_steps_fit_transform():
"""
All steps must implement fit and transform. Otherwise, raise TypeError.
Expand All @@ -25,3 +66,36 @@ def test_all_steps_fit_transform():

with pytest.raises(TypeError):
TransformerPipeline([('svc', NoFitT())])


@patch.object(Trans, 'fit', side_effect=func_x_y)
def test_called_with_x_and_y(mock_fit):
    """
    A fit method that accepts both X and y receives both, along with
    any extra keyword arguments.
    """
    _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg')
    mock_fit.assert_called_with('X', 'y', kwarg='kwarg')


@patch.object(Trans, 'fit', side_effect=func_x)
def test_called_with_x(mock_fit):
    """
    A fit method that accepts only X ends up called with X (plus
    keywords), whether or not a y was passed to _call_fit.
    """
    _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg')
    mock_fit.assert_called_with('X', kwarg='kwarg')

    _call_fit(Trans().fit, 'X', kwarg='kwarg')
    mock_fit.assert_called_with('X', kwarg='kwarg')


@patch.object(Trans, 'fit', side_effect=func_raise_type_err)
def test_raises_type_error(mock_fit):
    """
    If a fit method with required X and y arguments raises a TypeError,
    _call_fit re-raises it (via the one-argument retry) instead of
    swallowing it.
    """
    with pytest.raises(TypeError):
        _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg')