Skip to content

Commit 8010b4f

Browse files
jph00 and dukebody
authored and committed
add df_out to return a data frame
1 parent 10a43e4 commit 8010b4f

File tree

2 files changed

+110
-4
lines changed

2 files changed

+110
-4
lines changed

sklearn_pandas/dataframe_mapper.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
3333
sklearn transformation.
3434
"""
3535

36-
def __init__(self, features, default=False, sparse=False):
36+
def __init__(self, features, default=False, sparse=False, df_out=False):
3737
"""
3838
Params:
3939
@@ -50,13 +50,23 @@ def __init__(self, features, default=False, sparse=False):
5050
5151
sparse will return sparse matrix if set True and any of the
5252
extracted features is sparse. Defaults to False.
53+
54+
df_out return a pandas data frame, with each column named using
55+
the pandas column that created it (if there's only one
56+
input and output) or the input columns joined with '_'
57+
if there's multiple inputs, and the name concatenated with
58+
'_1', '_2' etc if there's multiple outputs. NB: does not
59+
work if *default* or *sparse* are true
5360
"""
5461
if isinstance(features, list):
5562
features = [(columns, _build_transformer(transformers))
5663
for (columns, transformers) in features]
5764
self.features = features
5865
self.default = _build_transformer(default)
5966
self.sparse = sparse
67+
self.df_out = df_out
68+
if (df_out and (sparse or default)):
69+
raise ValueError("Can not use df_out with sparse or default")
6070

6171
@property
6272
def _selected_columns(self):
@@ -94,6 +104,7 @@ def __setstate__(self, state):
94104
# compatibility shim for pickles created before ``default`` init
95105
# argument existed
96106
self.default = state.get('default', False)
107+
self.df_out = state.get('df_out', False)
97108

98109
def _get_col_subset(self, X, cols):
99110
"""
@@ -145,13 +156,26 @@ def fit(self, X, y=None):
145156
self._get_col_subset(X, self._unselected_columns(X)), y)
146157
return self
147158

159+
160+
def get_names(self, c, t, x):
    """
    Build verbose output-column names for one mapped feature.

    Params:

    c   the input column name, or a list of input column names;
        a list is joined with '_' to form the base name

    t   the transformer applied to the column(s); if it exposes a
        ``classes_`` attribute with more than two classes, one name
        per class is produced

    x   the transformed output; its shape decides how many numbered
        names are needed when the transformer has no class labels

    Returns a list of column names, one per output column.
    """
    # Multiple input columns share a single '_'-joined base name.
    if isinstance(c, list):
        c = '_'.join(c)
    # Transformers like LabelBinarizer expose class labels; with more
    # than two classes there is one output column per class.
    if hasattr(t, 'classes_') and len(t.classes_) > 2:
        # str() so non-string class labels (e.g. ints) don't raise a
        # TypeError on concatenation
        return [c + '_' + str(o) for o in t.classes_]
    # Multiple unnamed outputs are numbered '_0', '_1', ...
    elif len(x.shape) > 1 and x.shape[1] > 1:
        return [c + '_' + str(o) for o in range(x.shape[1])]
    else:
        # Single output column keeps the plain base name.
        return [c]
169+
170+
148171
def transform(self, X):
149172
"""
150173
Transform the given data. Assumes that fit has already been called.
151174
152175
X the data to transform
153176
"""
154177
extracted = []
178+
index = []
155179
for columns, transformers in self.features:
156180
# columns could be a string or list of
157181
# strings; we don't care because pandas
@@ -160,10 +184,13 @@ def transform(self, X):
160184
if transformers is not None:
161185
Xt = transformers.transform(Xt)
162186
extracted.append(_handle_feature(Xt))
187+
if self.df_out:
188+
index = index + self.get_names(columns, transformers, Xt)
163189

164190
# handle features not explicitly selected
165191
if self.default is not False:
166-
Xt = self._get_col_subset(X, self._unselected_columns(X))
192+
unsel_cols = self._unselected_columns(X)
193+
Xt = self._get_col_subset(X, unsel_cols)
167194
if self.default is not None:
168195
Xt = self.default.transform(Xt)
169196
extracted.append(_handle_feature(Xt))
@@ -185,4 +212,7 @@ def transform(self, X):
185212
else:
186213
stacked = np.hstack(extracted)
187214

188-
return stacked
215+
if not self.df_out:
216+
return stacked
217+
218+
return pd.DataFrame(stacked, columns=index)

tests/test_dataframe_mapper.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@
1717
from sklearn.pipeline import Pipeline
1818
from sklearn.svm import SVC
1919
from sklearn.feature_extraction.text import CountVectorizer
20-
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
20+
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder, LabelBinarizer
2121
from sklearn.feature_selection import SelectKBest, chi2
2222
from sklearn.base import BaseEstimator, TransformerMixin
23+
import sklearn.decomposition
2324
import numpy as np
2425
from numpy.testing import assert_array_equal
2526
import pickle
@@ -77,6 +78,81 @@ def complex_dataframe():
7778
'feat2': [1, 2, 3, 2, 3, 4]})
7879

7980

81+
def test_simple_df(simple_dataframe):
    """
    Get a dataframe from a simple mapped dataframe
    """
    mapper = DataFrameMapper([('a', None)], df_out=True)
    out = mapper.fit_transform(simple_dataframe)
    # df_out=True must yield a real DataFrame with the column intact
    assert type(out) == pd.DataFrame
    assert len(out['a']) == len(simple_dataframe['a'])
90+
91+
92+
def test_complex_df(complex_dataframe):
    """
    Get a dataframe from a complex mapped dataframe
    """
    features = [(col, None) for col in ('target', 'feat1', 'feat2')]
    mapper = DataFrameMapper(features, df_out=True)
    out = mapper.fit_transform(complex_dataframe)
    # row count preserved, and every input column survives the mapping
    assert len(out) == len(complex_dataframe)
    for col in complex_dataframe.columns:
        assert len(out[col]) == len(complex_dataframe[col])
102+
103+
104+
def test_binarizer_df():
    """
    Check level names from LabelBinarizer
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    cols = mapper.fit_transform(df).columns
    # three classes -> one named output column per class
    assert len(cols) == 3
    assert cols[0] == 'target_a'
    assert cols[1] == 'target_b'
    assert cols[2] == 'target_c'
116+
117+
118+
def test_binarizer2_df():
    """
    Check level names from LabelBinarizer with just one output column
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    cols = mapper.fit_transform(df).columns
    # binary case collapses to a single column keeping the plain name
    assert len(cols) == 1
    assert cols[0] == 'target'
128+
129+
130+
def test_onehot_df():
    """
    Check level ids from one-hot
    """
    df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
    cols = mapper.fit_transform(df).columns
    # four distinct levels -> four numbered output columns
    assert len(cols) == 4
    assert cols[0] == 'target_0'
    assert cols[3] == 'target_3'
141+
142+
143+
def test_pca(complex_dataframe):
    """
    Check multi in and out with PCA
    """
    mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], df_out=True)
    cols = mapper.fit_transform(complex_dataframe).columns
    # two inputs join with '_' as the base name; two outputs get numbered
    assert len(cols) == 2
    assert cols[0] == 'feat1_feat2_0'
    assert cols[1] == 'feat1_feat2_1'
154+
155+
80156
def test_nonexistent_columns_explicit_fail(simple_dataframe):
81157
"""
82158
If a nonexistent column is selected, KeyError is raised.

0 commit comments

Comments
 (0)