Skip to content

Commit 9fef9aa

Browse files
committed
added unit tests and updated README
added unit tests to test_dataframe_mapper and updated README
1 parent 8372f35 commit 9fef9aa

File tree

2 files changed

+65
-9
lines changed

2 files changed

+65
-9
lines changed

README.rst

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -167,19 +167,36 @@ passing it as the ``default`` argument to the mapper:
167167
... ('pet', sklearn.preprocessing.LabelBinarizer()),
168168
... ('children', None)
169169
... ], default=sklearn.preprocessing.StandardScaler())
170-
>>> np.round(mapper4.fit_transform(data.copy()))
171-
array([[ 1., 0., 0., 4., 2.],
172-
[ 0., 1., 0., 6., -1.],
173-
[ 0., 1., 0., 3., 0.],
174-
[ 0., 0., 1., 3., -1.],
175-
[ 1., 0., 0., 2., -0.],
176-
[ 0., 1., 0., 3., 1.],
177-
[ 1., 0., 0., 5., -0.],
178-
[ 0., 0., 1., 4., -1.]])
170+
>>> np.round(mapper4.fit_transform(data.copy()), 1)
171+
array([[ 1. , 0. , 0. , 4. , 2.3],
172+
[ 0. , 1. , 0. , 6. , -0.9],
173+
[ 0. , 1. , 0. , 3. , 0.1],
174+
[ 0. , 0. , 1. , 3. , -0.7],
175+
[ 1. , 0. , 0. , 2. , -0.5],
176+
[ 0. , 1. , 0. , 3. , 0.8],
177+
[ 1. , 0. , 0. , 5. , -0.3],
178+
[ 0. , 0. , 1. , 4. , -0.7]])
179179

180180
Using ``default=False`` (the default) drops unselected columns. Using
181181
``default=None`` passes the unselected columns through unchanged.
182182

183+
Feature selection and other supervised transformations
184+
******************************************************
185+
186+
``DataFrameMapper`` supports transformers that require both X and y arguments. An example of this is feature selection. Treating the ``pet`` column as the target, we will select whichever of the ``children`` and ``salary`` columns best predicts it.
187+
188+
>>> from sklearn.feature_selection import SelectKBest, chi2
189+
>>> mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
190+
>>> mapper_fs.fit_transform(data[['children','salary']], data['pet'])
191+
array([[ 90.],
192+
[ 24.],
193+
[ 44.],
194+
[ 27.],
195+
[ 32.],
196+
[ 59.],
197+
[ 36.],
198+
[ 27.]])
199+
183200
Working with sparse features
184201
****************************
185202

tests/test_dataframe_mapper.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from sklearn.svm import SVC
1919
from sklearn.feature_extraction.text import CountVectorizer
2020
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
21+
from sklearn.feature_selection import SelectKBest, chi2
2122
from sklearn.base import BaseEstimator, TransformerMixin
2223
import numpy as np
2324
from numpy.testing import assert_array_equal
@@ -69,6 +70,13 @@ def simple_dataframe():
6970
return pd.DataFrame({'a': [1, 2, 3]})
7071

7172

73+
@pytest.fixture
def complex_dataframe():
    """
    Six-row frame with a two-valued string 'target' column and two
    integer feature columns, 'feat1' and 'feat2'.
    """
    columns = {
        'target': ['a', 'a', 'a', 'b', 'b', 'b'],
        'feat1': [1, 2, 3, 4, 5, 6],
        'feat2': [1, 2, 3, 2, 3, 4],
    }
    return pd.DataFrame(columns)
78+
79+
7280
def test_nonexistent_columns_explicit_fail(simple_dataframe):
7381
"""
7482
If a nonexistent column is selected, KeyError is raised.
@@ -306,6 +314,37 @@ def test_sparse_off(simple_dataframe):
306314
assert type(dmatrix) != sparse.csr.csr_matrix
307315

308316

317+
def test_fit_with_optional_y_arg(complex_dataframe):
    """
    A transformer whose fit method takes an optional y argument
    can be fitted through the mapper when y is supplied.
    """
    mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())])
    features = complex_dataframe[['feat1', 'feat2']]
    target = complex_dataframe['target']
    # The test passes as long as fit() does not raise.
    mapper.fit(features, target)
326+
327+
328+
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly.

    SelectKBest(chi2, k=1) is fitted against the 'target' column; both
    fit_transform and transform are expected to return the 'feat1' column.
    """
    df = complex_dataframe
    features = df[['feat1', 'feat2']]
    target = df['target']
    expected = df[['feat1']].values
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit, doesn't fail. The return value is intentionally not bound:
    # nothing here inspects it, and naming it like the transformed array
    # would be misleading.
    mapper.fit(features, target)

    # fit_transform
    ft_arr = mapper.fit_transform(features, target)
    assert_array_equal(ft_arr, expected)

    # transform
    t_arr = mapper.transform(features)
    assert_array_equal(t_arr, expected)
346+
347+
309348
# Integration tests with real dataframes
310349

311350
@pytest.fixture

0 commit comments

Comments
 (0)