Skip to content

Commit 0d4f562

Browse files
committed
Factor out code into several modules to avoid having everything in __init__.py.
1 parent b4a45ae commit 0d4f562

File tree

5 files changed

+175
-169
lines changed

5 files changed

+175
-169
lines changed

README.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ Changelog
193193
1.1.0 (development)
194194
*******************
195195

196+
* Factor out code into several modules to avoid having everything in ``__init__.py``.
196197
* Use custom ``TransformerPipeline`` class to allow transformation steps that accept only an ``X`` argument. Fixes #46.
197198
* Add compatibility shim for unpickling mappers with list of transformers created before 1.0.0. Fixes #45.
198199

sklearn_pandas/__init__.py

Lines changed: 3 additions & 166 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,10 @@
11
__version__ = '1.0.0'
22

3-
4-
import sys
53
import numpy as np
6-
import pandas as pd
7-
from scipy import sparse
8-
from sklearn.base import BaseEstimator, TransformerMixin
9-
from sklearn import cross_validation
10-
from sklearn import grid_search
11-
from .pipeline import make_transformer_pipeline
12-
13-
# load in the correct stringtype: str for py3, basestring for py2
14-
string_types = str if sys.version_info >= (3, 0) else basestring
15-
16-
17-
def cross_val_score(model, X, *args, **kwargs):
18-
X = DataWrapper(X)
19-
return cross_validation.cross_val_score(model, X, *args, **kwargs)
20-
21-
22-
class GridSearchCV(grid_search.GridSearchCV):
23-
def fit(self, X, *params, **kwparams):
24-
return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
25-
26-
def predict(self, X, *params, **kwparams):
27-
return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
28-
29-
30-
try:
31-
class RandomizedSearchCV(grid_search.RandomizedSearchCV):
32-
def fit(self, X, *params, **kwparams):
33-
return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
34-
35-
def predict(self, X, *params, **kwparams):
36-
return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
37-
except AttributeError:
38-
pass
4+
from sklearn.base import TransformerMixin
395

40-
41-
class DataWrapper(object):
42-
def __init__(self, df):
43-
self.df = df
44-
45-
def __len__(self):
46-
return len(self.df)
47-
48-
def __getitem__(self, key):
49-
return self.df.iloc[key]
6+
from .dataframe_mapper import DataFrameMapper # NOQA
7+
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
508

519

5210
class PassthroughTransformer(TransformerMixin):
@@ -55,124 +13,3 @@ def fit(self, X, y=None, **fit_params):
5513

5614
def transform(self, X):
5715
return np.array(X).astype(np.float)
58-
59-
60-
def _handle_feature(fea):
61-
"""
62-
Convert 1-dimensional arrays to 2-dimensional column vectors.
63-
"""
64-
if len(fea.shape) == 1:
65-
fea = np.array([fea]).T
66-
67-
return fea
68-
69-
70-
def _build_transformer(transformers):
71-
if isinstance(transformers, list):
72-
transformers = make_transformer_pipeline(*transformers)
73-
return transformers
74-
75-
76-
class DataFrameMapper(BaseEstimator, TransformerMixin):
77-
"""
78-
Map Pandas data frame column subsets to their own
79-
sklearn transformation.
80-
"""
81-
82-
def __init__(self, features, sparse=False):
83-
"""
84-
Params:
85-
86-
features a list of pairs. The first element is the pandas column
87-
selector. This can be a string (for one column) or a list
88-
of strings. The second element is an object that supports
89-
sklearn's transform interface, or a list of such objects.
90-
sparse will return sparse matrix if set True and any of the
91-
extracted features is sparse. Defaults to False.
92-
"""
93-
if isinstance(features, list):
94-
features = [(columns, _build_transformer(transformers))
95-
for (columns, transformers) in features]
96-
self.features = features
97-
self.sparse = sparse
98-
99-
def __setstate__(self, state):
100-
# compatibility shim for pickles created with sklearn-pandas<1.0.0
101-
self.features = [(columns, _build_transformer(transformers))
102-
for (columns, transformers) in state['features']]
103-
self.sparse = state.get('sparse', False)
104-
105-
def _get_col_subset(self, X, cols):
106-
"""
107-
Get a subset of columns from the given table X.
108-
109-
X a Pandas dataframe; the table to select columns from
110-
cols a string or list of strings representing the columns
111-
to select
112-
113-
Returns a numpy array with the data from the selected columns
114-
"""
115-
return_vector = False
116-
if isinstance(cols, string_types):
117-
return_vector = True
118-
cols = [cols]
119-
120-
if isinstance(X, list):
121-
X = [x[cols] for x in X]
122-
X = pd.DataFrame(X)
123-
124-
elif isinstance(X, DataWrapper):
125-
# if it's a datawrapper, unwrap it
126-
X = X.df
127-
128-
if return_vector:
129-
t = X[cols[0]].values
130-
else:
131-
t = X[cols].values
132-
133-
return t
134-
135-
def fit(self, X, y=None):
136-
"""
137-
Fit a transformation from the pipeline
138-
139-
X the data to fit
140-
"""
141-
for columns, transformers in self.features:
142-
if transformers is not None:
143-
transformers.fit(self._get_col_subset(X, columns))
144-
return self
145-
146-
def transform(self, X):
147-
"""
148-
Transform the given data. Assumes that fit has already been called.
149-
150-
X the data to transform
151-
"""
152-
extracted = []
153-
for columns, transformers in self.features:
154-
# columns could be a string or list of
155-
# strings; we don't care because pandas
156-
# will handle either.
157-
Xt = self._get_col_subset(X, columns)
158-
if transformers is not None:
159-
Xt = transformers.transform(Xt)
160-
extracted.append(_handle_feature(Xt))
161-
162-
# combine the feature outputs into one array.
163-
# at this point we lose track of which features
164-
# were created from which input columns, so it's
165-
# assumed that that doesn't matter to the model.
166-
167-
# If any of the extracted features is sparse, combine sparsely.
168-
# Otherwise, combine as normal arrays.
169-
if any(sparse.issparse(fea) for fea in extracted):
170-
stacked = sparse.hstack(extracted).tocsr()
171-
# return a sparse matrix only if the mapper was initialized
172-
# with sparse=True
173-
if not self.sparse:
174-
stacked = stacked.toarray()
175-
else:
176-
stacked = np.hstack(extracted)
177-
178-
return stacked

sklearn_pandas/cross_validation.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from sklearn import cross_validation
2+
from sklearn import grid_search
3+
4+
5+
def cross_val_score(model, X, *args, **kwargs):
6+
X = DataWrapper(X)
7+
return cross_validation.cross_val_score(model, X, *args, **kwargs)
8+
9+
10+
class GridSearchCV(grid_search.GridSearchCV):
11+
def fit(self, X, *params, **kwparams):
12+
return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
13+
14+
def predict(self, X, *params, **kwparams):
15+
return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
16+
17+
18+
try:
19+
class RandomizedSearchCV(grid_search.RandomizedSearchCV):
20+
def fit(self, X, *params, **kwparams):
21+
return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
22+
23+
def predict(self, X, *params, **kwparams):
24+
return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
25+
except AttributeError:
26+
pass
27+
28+
29+
class DataWrapper(object):
30+
def __init__(self, df):
31+
self.df = df
32+
33+
def __len__(self):
34+
return len(self.df)
35+
36+
def __getitem__(self, key):
37+
return self.df.iloc[key]

sklearn_pandas/dataframe_mapper.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import sys
2+
import pandas as pd
3+
import numpy as np
4+
from scipy import sparse
5+
from sklearn.base import BaseEstimator, TransformerMixin
6+
7+
from .cross_validation import DataWrapper
8+
from .pipeline import make_transformer_pipeline
9+
10+
# load in the correct stringtype: str for py3, basestring for py2
11+
string_types = str if sys.version_info >= (3, 0) else basestring
12+
13+
14+
def _handle_feature(fea):
15+
"""
16+
Convert 1-dimensional arrays to 2-dimensional column vectors.
17+
"""
18+
if len(fea.shape) == 1:
19+
fea = np.array([fea]).T
20+
21+
return fea
22+
23+
24+
def _build_transformer(transformers):
25+
if isinstance(transformers, list):
26+
transformers = make_transformer_pipeline(*transformers)
27+
return transformers
28+
29+
30+
class DataFrameMapper(BaseEstimator, TransformerMixin):
31+
"""
32+
Map Pandas data frame column subsets to their own
33+
sklearn transformation.
34+
"""
35+
36+
def __init__(self, features, sparse=False):
37+
"""
38+
Params:
39+
40+
features a list of pairs. The first element is the pandas column
41+
selector. This can be a string (for one column) or a list
42+
of strings. The second element is an object that supports
43+
sklearn's transform interface, or a list of such objects.
44+
sparse will return sparse matrix if set True and any of the
45+
extracted features is sparse. Defaults to False.
46+
"""
47+
if isinstance(features, list):
48+
features = [(columns, _build_transformer(transformers))
49+
for (columns, transformers) in features]
50+
self.features = features
51+
self.sparse = sparse
52+
53+
def __setstate__(self, state):
54+
# compatibility shim for pickles created with sklearn-pandas<1.0.0
55+
self.features = [(columns, _build_transformer(transformers))
56+
for (columns, transformers) in state['features']]
57+
self.sparse = state.get('sparse', False)
58+
59+
def _get_col_subset(self, X, cols):
60+
"""
61+
Get a subset of columns from the given table X.
62+
63+
X a Pandas dataframe; the table to select columns from
64+
cols a string or list of strings representing the columns
65+
to select
66+
67+
Returns a numpy array with the data from the selected columns
68+
"""
69+
return_vector = False
70+
if isinstance(cols, string_types):
71+
return_vector = True
72+
cols = [cols]
73+
74+
if isinstance(X, list):
75+
X = [x[cols] for x in X]
76+
X = pd.DataFrame(X)
77+
78+
elif isinstance(X, DataWrapper):
79+
# if it's a datawrapper, unwrap it
80+
X = X.df
81+
82+
if return_vector:
83+
t = X[cols[0]].values
84+
else:
85+
t = X[cols].values
86+
87+
return t
88+
89+
def fit(self, X, y=None):
90+
"""
91+
Fit a transformation from the pipeline
92+
93+
X the data to fit
94+
"""
95+
for columns, transformers in self.features:
96+
if transformers is not None:
97+
transformers.fit(self._get_col_subset(X, columns))
98+
return self
99+
100+
def transform(self, X):
101+
"""
102+
Transform the given data. Assumes that fit has already been called.
103+
104+
X the data to transform
105+
"""
106+
extracted = []
107+
for columns, transformers in self.features:
108+
# columns could be a string or list of
109+
# strings; we don't care because pandas
110+
# will handle either.
111+
Xt = self._get_col_subset(X, columns)
112+
if transformers is not None:
113+
Xt = transformers.transform(Xt)
114+
extracted.append(_handle_feature(Xt))
115+
116+
# combine the feature outputs into one array.
117+
# at this point we lose track of which features
118+
# were created from which input columns, so it's
119+
# assumed that that doesn't matter to the model.
120+
121+
# If any of the extracted features is sparse, combine sparsely.
122+
# Otherwise, combine as normal arrays.
123+
if any(sparse.issparse(fea) for fea in extracted):
124+
stacked = sparse.hstack(extracted).tocsr()
125+
# return a sparse matrix only if the mapper was initialized
126+
# with sparse=True
127+
if not self.sparse:
128+
stacked = stacked.toarray()
129+
else:
130+
stacked = np.hstack(extracted)
131+
132+
return stacked

tests/test_dataframe_mapper.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,9 @@
2323
from sklearn_pandas import (
2424
DataFrameMapper,
2525
PassthroughTransformer,
26-
cross_val_score,
27-
_build_transformer,
28-
_handle_feature,
26+
cross_val_score
2927
)
28+
from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer
3029
from sklearn_pandas.pipeline import TransformerPipeline
3130

3231

0 commit comments

Comments
 (0)