-
Notifications
You must be signed in to change notification settings - Fork 418
added y argument to fit methods #59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,12 +3,37 @@ | |
from sklearn.utils import tosequence | ||
|
||
|
||
def _call_fit(fit_method, X, y=None, **kwargs): | ||
""" | ||
helper function, calls the fit or fit_transform method with the correct | ||
number of parameters | ||
|
||
fit_method: fit or fit_transform method of the transformer | ||
X: the data to fit | ||
y: the target vector relative to X, optional | ||
kwargs: any keyword arguments to the fit method | ||
|
||
return: the result of the fit or fit_transform method | ||
|
||
WARNING: if this function raises a TypeError exception, test the fit | ||
or fit_transform method passed to it in isolation as _call_fit will not | ||
distinguish TypeError due to incorrect number of arguments from | ||
other TypeErrors | ||
""" | ||
try: | ||
return fit_method(X, y, **kwargs) | ||
except TypeError: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What about:
I know it looks a bit hacky but I guess it will solve your warning above. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is pretty clever, I hadn't thought of doing something like this before. Unfortunately when I tested it, the error message accompanying a TypeError varies between Python 2 and 3. On 2, it's "test_func() takes exactly 2 arguments (1 given)" but on 3 it's "test_func() missing 1 required positional argument:". It's probably safer to leave as is. |
||
# fit takes only one argument | ||
return fit_method(X, **kwargs) | ||
|
||
|
||
class TransformerPipeline(Pipeline): | ||
""" | ||
Pipeline that expects all steps to be transformers taking a single argument | ||
Pipeline that expects all steps to be transformers taking a single X argument, | ||
an optional y argument, | ||
and having fit and transform methods. | ||
|
||
Code is copied from sklearn's Pipeline, leaving out the `y=None` argument. | ||
Code is copied from sklearn's Pipeline | ||
""" | ||
def __init__(self, steps): | ||
names, estimators = zip(*steps) | ||
|
@@ -31,31 +56,34 @@ def __init__(self, steps): | |
"'%s' (type %s) doesn't)" | ||
% (estimator, type(estimator))) | ||
|
||
def _pre_transform(self, X, **fit_params): | ||
def _pre_transform(self, X, y=None, **fit_params): | ||
fit_params_steps = dict((step, {}) for step, _ in self.steps) | ||
for pname, pval in six.iteritems(fit_params): | ||
step, param = pname.split('__', 1) | ||
fit_params_steps[step][param] = pval | ||
Xt = X | ||
for name, transform in self.steps[:-1]: | ||
if hasattr(transform, "fit_transform"): | ||
Xt = transform.fit_transform(Xt, **fit_params_steps[name]) | ||
Xt = _call_fit(transform.fit_transform, | ||
Xt, y, **fit_params_steps[name]) | ||
else: | ||
Xt = transform.fit(Xt, **fit_params_steps[name]) \ | ||
.transform(Xt) | ||
Xt = _call_fit(transform.fit, | ||
Xt, y, **fit_params_steps[name]).transform(Xt) | ||
return Xt, fit_params_steps[self.steps[-1][0]] | ||
|
||
def fit(self, X, **fit_params): | ||
Xt, fit_params = self._pre_transform(X, **fit_params) | ||
self.steps[-1][-1].fit(Xt, **fit_params) | ||
def fit(self, X, y=None, **fit_params): | ||
Xt, fit_params = self._pre_transform(X, y, **fit_params) | ||
_call_fit(self.steps[-1][-1].fit, Xt, y, **fit_params) | ||
return self | ||
|
||
def fit_transform(self, X, **fit_params): | ||
Xt, fit_params = self._pre_transform(X, **fit_params) | ||
def fit_transform(self, X, y=None, **fit_params): | ||
Xt, fit_params = self._pre_transform(X, y, **fit_params) | ||
if hasattr(self.steps[-1][-1], 'fit_transform'): | ||
return self.steps[-1][-1].fit_transform(Xt, **fit_params) | ||
return _call_fit(self.steps[-1][-1].fit_transform, | ||
Xt, y, **fit_params) | ||
else: | ||
return self.steps[-1][-1].fit(Xt, **fit_params).transform(Xt) | ||
return _call_fit(self.steps[-1][-1].fit, | ||
Xt, y, **fit_params).transform(Xt) | ||
|
||
|
||
def make_transformer_pipeline(*steps): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Why does one need to pass "1" as second argument to this transform, and why is the output different from the previous case in the last column?
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
The "1" is the second argument of np.round. This test case was failing even though I don't think I modified anything that affected it. The issue seems to be that on my machine np.round(-0.3) equals "0.", not "-0." Changing it to round to 1 decimal place fixed the test case.