Skip to content

Allow specifying a default transformer #57

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,16 +157,38 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
[ 1., 0., 0., 5.],
[ 0., 0., 1., 4.]])

Applying a default transformer
******************************

A default transformer can be applied to the columns that are not explicitly
selected, by passing it as the ``default`` argument to the mapper:

>>> mapper4 = DataFrameMapper([
... ('pet', sklearn.preprocessing.LabelBinarizer()),
... ('children', None)
... ], default=sklearn.preprocessing.StandardScaler())
>>> np.round(mapper4.fit_transform(data.copy()))
array([[ 1., 0., 0., 4., 2.],
[ 0., 1., 0., 6., -1.],
[ 0., 1., 0., 3., 0.],
[ 0., 0., 1., 3., -1.],
[ 1., 0., 0., 2., -0.],
[ 0., 1., 0., 3., 1.],
[ 1., 0., 0., 5., -0.],
[ 0., 0., 1., 4., -1.]])

Using ``default=False`` (the default) drops unselected columns. Using
``default=None`` passes the unselected columns through unchanged.

Working with sparse features
****************************

``DataFrameMapper``s will return a dense feature array by default. Setting ``sparse=True`` in the mapper will return a sparse array whenever any of the extracted features is sparse. Example:

>>> mapper4 = DataFrameMapper([
>>> mapper5 = DataFrameMapper([
... ('pet', CountVectorizer()),
... ], sparse=True)
>>> type(mapper4.fit_transform(data))
>>> type(mapper5.fit_transform(data))
<class 'scipy.sparse.csr.csr_matrix'>

The stacking of the sparse features is done without ever densifying them.
Expand Down Expand Up @@ -195,6 +217,8 @@ Development

* Deprecate custom cross-validation shim classes.
* Require ``scikit-learn>=0.15.0``. Resolves #49.
* Allow applying a default transformer to columns not selected explicitly in
the mapper. Resolves #55.


1.1.0 (2015-12-06)
Expand Down
55 changes: 54 additions & 1 deletion sklearn_pandas/dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,68 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
sklearn transformation.
"""

def __init__(self, features, default=False, sparse=False):
    """
    Params:

    features    a list of pairs. The first element is the pandas column
                selector. This can be a string (for one column) or a list
                of strings. The second element is an object that supports
                sklearn's transform interface, or a list of such objects.

    default     default transformer to apply to the columns not
                explicitly selected in the mapper. If False (default),
                discard them. If None, pass them through untouched. Any
                other transformer will be applied to all the unselected
                columns as a whole, taken as a 2d-array.

    sparse      will return sparse matrix if set True and any of the
                extracted features is sparse. Defaults to False.
    """
    if isinstance(features, list):
        # Normalize each (selector, transformer) pair: transformer lists
        # become pipelines via _build_transformer.
        features = [(selector, _build_transformer(trans))
                    for selector, trans in features]
    self.features = features
    self.default = _build_transformer(default)
    self.sparse = sparse

@property
def _selected_columns(self):
    """
    Set of all column names explicitly mentioned in the feature list.
    """
    columns = set()
    for feature in self.features:
        selector = feature[0]
        if isinstance(selector, list):
            columns.update(selector)
        else:
            columns.add(selector)
    return columns

def _unselected_columns(self, X):
    """
    Return list of columns present in X and not selected explicitly in the
    mapper.

    Columns come back in dataframe order, so the default transformer sees
    the same ordering during both fit and transform.
    """
    selected = self._selected_columns
    return [column for column in X.columns if column not in selected]

def __setstate__(self, state):
    # compatibility shim for pickles created with sklearn-pandas<1.0.0:
    # re-run transformer normalization on the stored feature pairs
    self.features = [(selector, _build_transformer(trans))
                     for selector, trans in state['features']]
    self.sparse = state.get('sparse', False)

    # compatibility shim for pickles created before the ``default``
    # init argument existed
    self.default = state.get('default', False)

def _get_col_subset(self, X, cols):
"""
Get a subset of columns from the given table X.
Expand Down Expand Up @@ -95,6 +134,12 @@ def fit(self, X, y=None):
for columns, transformers in self.features:
if transformers is not None:
transformers.fit(self._get_col_subset(X, columns))

# handle features not explicitly selected
if self.default: # not False and not None
self.default.fit(
self._get_col_subset(X, self._unselected_columns(X))
)
return self

def transform(self, X):
Expand All @@ -113,6 +158,14 @@ def transform(self, X):
Xt = transformers.transform(Xt)
extracted.append(_handle_feature(Xt))

# handle features not explicitly selected
if self.default is not False:
Xt = self._get_col_subset(X, self._unselected_columns(X))
if self.default is not None:
Xt = self.default.transform(Xt)
extracted.append(_handle_feature(Xt))


# combine the feature outputs into one array.
# at this point we lose track of which features
# were created from which input columns, so it's
Expand Down
77 changes: 76 additions & 1 deletion tests/test_dataframe_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from numpy.testing import assert_array_equal
Expand Down Expand Up @@ -159,6 +159,70 @@ def test_build_transformers():
assert pipeline.steps[ix][1] == transformer


def test_selected_columns():
    """
    _selected_columns is the set of every column name appearing in the
    mapper's feature list.
    """
    mapper = DataFrameMapper([
        ('a', None),
        (['a', 'b'], None),
    ])
    assert mapper._selected_columns == set(['a', 'b'])


def test_unselected_columns():
    """
    _unselected_columns lists the dataframe columns that the mapper's
    features do not mention.
    """
    df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
    mapper = DataFrameMapper([
        ('a', None),
        (['a', 'b'], None),
    ])
    assert 'c' in mapper._unselected_columns(df)


def test_default_false():
    """
    With default=False, columns the mapper does not explicitly select are
    dropped from the output.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([('b', None)], default=False)

    out = mapper.fit_transform(df)
    # only the selected 'b' column survives
    assert out.shape == (3, 1)


def test_default_none():
    """
    With default=None, columns the mapper does not explicitly select are
    passed through to the output untransformed.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([(['a'], OneHotEncoder())], default=None)

    out = mapper.fit_transform(df)
    # three one-hot columns for 'a', then the untouched 'b' column
    assert (out[:, 3] == np.array([3, 5, 7]).T).all()


def test_default_transformer():
    """
    If default is a transformer, it is fitted on and applied to all the
    columns not explicitly selected in the mapper.
    """
    df = pd.DataFrame({'a': [1, np.nan, 3], })
    mapper = DataFrameMapper([], default=Imputer())

    transformed = mapper.fit_transform(df)
    # Imputer's default (mean) strategy fills the NaN with mean([1, 3]) == 2.
    # Bug fix: the index must be ``[:, 0]`` (first column); the original
    # ``[: 0]`` selected zero rows, making the assertion vacuously true.
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()


def test_list_transformers_single_arg(simple_dataframe):
"""
Multiple transformers can be specified in a list even if some of them
Expand Down Expand Up @@ -203,6 +267,17 @@ def test_list_transformers_old_unpickle(simple_dataframe):
assert isinstance(transformer.steps[0][1], MockXTransformer)


def test_default_old_unpickle(simple_dataframe):
    mapper = DataFrameMapper([('a', None)])
    # pretend the mapper was pickled before the ``default`` init argument
    # existed
    del mapper.default

    restored = pickle.loads(pickle.dumps(mapper))
    restored.fit_transform(simple_dataframe)  # doesn't fail


def test_sparse_features(simple_dataframe):
"""
If any of the extracted features is sparse and "sparse" argument
Expand Down