added unit tests and updated README

vzaretsk · vzaretsk · commit 9fef9aa4bdbe · 2016-07-16T18:49:46.000-07:00
added unit tests to test_dataframe_mapper and updated README
diff --git a/README.rst b/README.rst
@@ -167,19 +167,36 @@ passing it as the ``default`` argument to the mapper:
     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
     ...     ('children', None)
     ... ], default=sklearn.preprocessing.StandardScaler())
-    >>> np.round(mapper4.fit_transform(data.copy()))
-    array([[ 1.,  0.,  0.,  4.,  2.],
-           [ 0.,  1.,  0.,  6., -1.],
-           [ 0.,  1.,  0.,  3.,  0.],
-           [ 0.,  0.,  1.,  3., -1.],
-           [ 1.,  0.,  0.,  2., -0.],
-           [ 0.,  1.,  0.,  3.,  1.],
-           [ 1.,  0.,  0.,  5., -0.],
-           [ 0.,  0.,  1.,  4., -1.]])
+    >>> np.round(mapper4.fit_transform(data.copy()), 1)
+    array([[ 1. ,  0. ,  0. ,  4. ,  2.3],
+           [ 0. ,  1. ,  0. ,  6. , -0.9],
+           [ 0. ,  1. ,  0. ,  3. ,  0.1],
+           [ 0. ,  0. ,  1. ,  3. , -0.7],
+           [ 1. ,  0. ,  0. ,  2. , -0.5],
+           [ 0. ,  1. ,  0. ,  3. ,  0.8],
+           [ 1. ,  0. ,  0. ,  5. , -0.3],
+           [ 0. ,  0. ,  1. ,  4. , -0.7]])
 
 Using ``default=False`` (the default) drops unselected columns. Using
 ``default=None`` pass the unselected columns unchanged.
 
+Feature selection and other supervised transformations
+******************************************************
+
+``DataFrameMapper`` supports transformers that require both X and y arguments. An example of this is feature selection. Treating the 'pet' column as the target, we will select the column that best predicts it.
+
+    >>> from sklearn.feature_selection import SelectKBest, chi2
+    >>> mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
+    >>> mapper_fs.fit_transform(data[['children','salary']], data['pet'])
+    array([[ 90.],
+           [ 24.],
+           [ 44.],
+           [ 27.],
+           [ 32.],
+           [ 59.],
+           [ 36.],
+           [ 27.]])
+
 Working with sparse features
 ****************************
 
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
@@ -18,6 +18,7 @@
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
+from sklearn.feature_selection import SelectKBest, chi2
 from sklearn.base import BaseEstimator, TransformerMixin
 import numpy as np
 from numpy.testing import assert_array_equal
@@ -69,6 +70,13 @@ def simple_dataframe():
     return pd.DataFrame({'a': [1, 2, 3]})
 
 
+@pytest.fixture
+def complex_dataframe():
+    return pd.DataFrame({'target': ['a', 'a', 'a', 'b', 'b', 'b'],
+                         'feat1': [1, 2, 3, 4, 5, 6],  # classes disjoint
+                         'feat2': [1, 2, 3, 2, 3, 4]})  # classes overlap
+
+
 def test_nonexistent_columns_explicit_fail(simple_dataframe):
     """
     If a nonexistent column is selected, KeyError is raised.
@@ -306,6 +314,37 @@ def test_sparse_off(simple_dataframe):
     assert type(dmatrix) != sparse.csr.csr_matrix
 
 
+def test_fit_with_optional_y_arg(complex_dataframe):
+    """
+    Transformers whose fit method takes an optional y argument
+    can be fitted through the mapper when y is supplied.
+    """
+    df = complex_dataframe
+    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
+    # smoke test: passes as long as fit(X, y) raises no exception
+    mapper.fit(df[['feat1', 'feat2']], df['target'])
+
+
+def test_fit_with_required_y_arg(complex_dataframe):
+    """
+    Transformers whose fit method requires a y argument are fitted
+    through the mapper and select the expected column (feat1).
+    """
+    df = complex_dataframe
+    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])
+
+    # fit alone must not raise; its return value is not checked here
+    mapper.fit(df[['feat1', 'feat2']], df['target'])
+
+    # fit_transform with y keeps only the feat1 column
+    ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target'])
+    assert_array_equal(ft_arr, df[['feat1']].values)
+
+    # transform on the fitted mapper (no y) yields the same column
+    t_arr = mapper.transform(df[['feat1', 'feat2']])
+    assert_array_equal(t_arr, df[['feat1']].values)
+
+
 # Integration tests with real dataframes
 
 @pytest.fixture