Skip to content

Commit 7a3c997

Browse files
committed
Merge pull request #57 from paulgb/default_transformer
Allow specifying a default transformer
2 parents 5280c00 + 0646548 commit 7a3c997

File tree

3 files changed

+156
-4
lines changed

3 files changed

+156
-4
lines changed

README.rst

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,16 +157,38 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
157157
[ 1., 0., 0., 5.],
158158
[ 0., 0., 1., 4.]])
159159

160+
Applying a default transformer
161+
******************************
162+
163+
A default transformer can be applied to columns not explicitly selected
164+
by passing it as the ``default`` argument to the mapper:
165+
166+
>>> mapper4 = DataFrameMapper([
167+
... ('pet', sklearn.preprocessing.LabelBinarizer()),
168+
... ('children', None)
169+
... ], default=sklearn.preprocessing.StandardScaler())
170+
>>> np.round(mapper4.fit_transform(data.copy()))
171+
array([[ 1., 0., 0., 4., 2.],
172+
[ 0., 1., 0., 6., -1.],
173+
[ 0., 1., 0., 3., 0.],
174+
[ 0., 0., 1., 3., -1.],
175+
[ 1., 0., 0., 2., -0.],
176+
[ 0., 1., 0., 3., 1.],
177+
[ 1., 0., 0., 5., -0.],
178+
[ 0., 0., 1., 4., -1.]])
179+
180+
Using ``default=False`` (the default) drops unselected columns. Using
181+
``default=None`` passes the unselected columns through unchanged.
160182

161183
Working with sparse features
162184
****************************
163185

164186
``DataFrameMapper``s will return a dense feature array by default. Setting ``sparse=True`` in the mapper will return a sparse array whenever any of the extracted features is sparse. Example:
165187

166-
>>> mapper4 = DataFrameMapper([
188+
>>> mapper5 = DataFrameMapper([
167189
... ('pet', CountVectorizer()),
168190
... ], sparse=True)
169-
>>> type(mapper4.fit_transform(data))
191+
>>> type(mapper5.fit_transform(data))
170192
<class 'scipy.sparse.csr.csr_matrix'>
171193

172194
The stacking of the sparse features is done without ever densifying them.
@@ -195,6 +217,8 @@ Development
195217

196218
* Deprecate custom cross-validation shim classes.
197219
* Require ``scikit-learn>=0.15.0``. Resolves #49.
220+
* Allow applying a default transformer to columns not selected explicitly in
221+
the mapper. Resolves #55.
198222

199223

200224
1.1.0 (2015-12-06)

sklearn_pandas/dataframe_mapper.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,68 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
3333
sklearn transformation.
3434
"""
3535

36-
def __init__(self, features, sparse=False):
36+
def __init__(self, features, default=False, sparse=False):
3737
"""
3838
Params:
3939
4040
features a list of pairs. The first element is the pandas column
4141
selector. This can be a string (for one column) or a list
4242
of strings. The second element is an object that supports
4343
sklearn's transform interface, or a list of such objects.
44+
45+
default default transformer to apply to the columns not
46+
explicitly selected in the mapper. If False (default),
47+
discard them. If None, pass them through untouched. Any
48+
other transformer will be applied to all the unselected
49+
columns as a whole, taken as a 2d-array.
50+
4451
sparse will return sparse matrix if set True and any of the
4552
extracted features is sparse. Defaults to False.
4653
"""
4754
if isinstance(features, list):
4855
features = [(columns, _build_transformer(transformers))
4956
for (columns, transformers) in features]
5057
self.features = features
58+
self.default = _build_transformer(default)
5159
self.sparse = sparse
5260

61+
@property
62+
def _selected_columns(self):
63+
"""
64+
Return a set of selected columns in the feature list.
65+
"""
66+
selected_columns = set()
67+
for feature in self.features:
68+
columns = feature[0]
69+
if isinstance(columns, list):
70+
selected_columns = selected_columns.union(set(columns))
71+
else:
72+
selected_columns.add(columns)
73+
return selected_columns
74+
75+
def _unselected_columns(self, X):
76+
"""
77+
Return list of columns present in X and not selected explicitly in the
78+
mapper.
79+
80+
Unselected columns are returned in the order they appear in the
81+
dataframe to avoid issues with different ordering during default fit
82+
and transform steps.
83+
"""
84+
X_columns = list(X.columns)
85+
return [column for column in X_columns if
86+
column not in self._selected_columns]
87+
5388
def __setstate__(self, state):
5489
# compatibility shim for pickles created with sklearn-pandas<1.0.0
5590
self.features = [(columns, _build_transformer(transformers))
5691
for (columns, transformers) in state['features']]
5792
self.sparse = state.get('sparse', False)
5893

94+
# compatibility shim for pickles created before ``default`` init
95+
# argument existed
96+
self.default = state.get('default', False)
97+
5998
def _get_col_subset(self, X, cols):
6099
"""
61100
Get a subset of columns from the given table X.
@@ -95,6 +134,12 @@ def fit(self, X, y=None):
95134
for columns, transformers in self.features:
96135
if transformers is not None:
97136
transformers.fit(self._get_col_subset(X, columns))
137+
138+
# handle features not explicitly selected
139+
if self.default: # not False and not None
140+
self.default.fit(
141+
self._get_col_subset(X, self._unselected_columns(X))
142+
)
98143
return self
99144

100145
def transform(self, X):
@@ -113,6 +158,14 @@ def transform(self, X):
113158
Xt = transformers.transform(Xt)
114159
extracted.append(_handle_feature(Xt))
115160

161+
# handle features not explicitly selected
162+
if self.default is not False:
163+
Xt = self._get_col_subset(X, self._unselected_columns(X))
164+
if self.default is not None:
165+
Xt = self.default.transform(Xt)
166+
extracted.append(_handle_feature(Xt))
167+
168+
116169
# combine the feature outputs into one array.
117170
# at this point we lose track of which features
118171
# were created from which input columns, so it's

tests/test_dataframe_mapper.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from sklearn.pipeline import Pipeline
1818
from sklearn.svm import SVC
1919
from sklearn.feature_extraction.text import CountVectorizer
20-
from sklearn.preprocessing import Imputer, StandardScaler
20+
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
2121
from sklearn.base import BaseEstimator, TransformerMixin
2222
import numpy as np
2323
from numpy.testing import assert_array_equal
@@ -159,6 +159,70 @@ def test_build_transformers():
159159
assert pipeline.steps[ix][1] == transformer
160160

161161

162+
def test_selected_columns():
163+
"""
164+
selected_columns returns a set of the columns appearing in the features
165+
of the mapper.
166+
"""
167+
mapper = DataFrameMapper([
168+
('a', None),
169+
(['a', 'b'], None)
170+
])
171+
assert mapper._selected_columns == {'a', 'b'}
172+
173+
174+
def test_unselected_columns():
175+
"""
176+
_unselected_columns returns a list of the columns not appearing in the
177+
features of the mapper but present in the given dataframe.
178+
"""
179+
df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
180+
mapper = DataFrameMapper([
181+
('a', None),
182+
(['a', 'b'], None)
183+
])
184+
assert 'c' in mapper._unselected_columns(df)
185+
186+
187+
def test_default_false():
188+
"""
189+
If default=False, non explicitly selected columns are discarded.
190+
"""
191+
df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
192+
mapper = DataFrameMapper([
193+
('b', None)
194+
], default=False)
195+
196+
transformed = mapper.fit_transform(df)
197+
assert transformed.shape == (3, 1)
198+
199+
200+
def test_default_none():
201+
"""
202+
If default=None, non explicitly selected columns are passed through
203+
untransformed.
204+
"""
205+
df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
206+
mapper = DataFrameMapper([
207+
(['a'], OneHotEncoder())
208+
], default=None)
209+
210+
transformed = mapper.fit_transform(df)
211+
assert (transformed[:, 3] == np.array([3, 5, 7]).T).all()
212+
213+
214+
def test_default_transformer():
215+
"""
216+
If default=Transformer, columns not explicitly selected are transformed by this
217+
transformer.
218+
"""
219+
df = pd.DataFrame({'a': [1, np.nan, 3], })
220+
mapper = DataFrameMapper([], default=Imputer())
221+
222+
transformed = mapper.fit_transform(df)
223+
assert (transformed[: 0] == np.array([1., 2., 3.])).all()
224+
225+
162226
def test_list_transformers_single_arg(simple_dataframe):
163227
"""
164228
Multiple transformers can be specified in a list even if some of them
@@ -203,6 +267,17 @@ def test_list_transformers_old_unpickle(simple_dataframe):
203267
assert isinstance(transformer.steps[0][1], MockXTransformer)
204268

205269

270+
def test_default_old_unpickle(simple_dataframe):
271+
mapper = DataFrameMapper([('a', None)])
272+
# simulate the mapper was pickled before the ``default`` init argument
273+
# existed
274+
del mapper.default
275+
mapper_pickled = pickle.dumps(mapper)
276+
277+
loaded_mapper = pickle.loads(mapper_pickled)
278+
loaded_mapper.fit_transform(simple_dataframe) # doesn't fail
279+
280+
206281
def test_sparse_features(simple_dataframe):
207282
"""
208283
If any of the extracted features is sparse and "sparse" argument

0 commit comments

Comments
 (0)