1
1
__version__ = '1.0.0'
2
2
3
-
4
- import sys
5
3
import numpy as np
6
- import pandas as pd
7
- from scipy import sparse
8
- from sklearn .base import BaseEstimator , TransformerMixin
9
- from sklearn import cross_validation
10
- from sklearn import grid_search
11
- from .pipeline import make_transformer_pipeline
12
-
13
- # load in the correct stringtype: str for py3, basestring for py2
14
- string_types = str if sys .version_info >= (3 , 0 ) else basestring
15
-
16
-
17
def cross_val_score(model, X, *args, **kwargs):
    """Run sklearn's cross_val_score with a pandas DataFrame as X.

    The DataFrame is wrapped in a DataWrapper so that sklearn's
    positional (integer) indexing works on it during CV splitting.
    All other arguments are forwarded unchanged.
    """
    wrapped = DataWrapper(X)
    return cross_validation.cross_val_score(model, wrapped, *args, **kwargs)
20
-
21
-
22
class GridSearchCV(grid_search.GridSearchCV):
    """Grid search that accepts a pandas DataFrame as X.

    Thin subclass of sklearn's GridSearchCV: X is wrapped in a
    DataWrapper before delegating, so the CV machinery can index
    the DataFrame positionally.
    """

    def fit(self, X, *params, **kwparams):
        wrapped = DataWrapper(X)
        return super(GridSearchCV, self).fit(wrapped, *params, **kwparams)

    def predict(self, X, *params, **kwparams):
        wrapped = DataWrapper(X)
        return super(GridSearchCV, self).predict(wrapped, *params, **kwparams)
28
-
29
-
30
try:
    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
        """Randomized search that accepts a pandas DataFrame as X.

        Same DataWrapper delegation pattern as GridSearchCV above.
        """

        def fit(self, X, *params, **kwparams):
            wrapped = DataWrapper(X)
            return super(RandomizedSearchCV, self).fit(wrapped, *params, **kwparams)

        def predict(self, X, *params, **kwparams):
            wrapped = DataWrapper(X)
            return super(RandomizedSearchCV, self).predict(wrapped, *params, **kwparams)
except AttributeError:
    # Older sklearn releases have no RandomizedSearchCV; simply skip the subclass.
    pass
4
+ from sklearn .base import TransformerMixin
39
5
40
-
41
class DataWrapper(object):
    """Minimal sequence-like wrapper around a pandas DataFrame.

    Exposes only __len__ and positional __getitem__ (via .iloc),
    which is the interface sklearn's CV splitters rely on.
    """

    def __init__(self, df):
        # the wrapped pandas DataFrame
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, key):
        # positional (integer-location) indexing, as sklearn expects
        return self.df.iloc[key]
6
+ from .dataframe_mapper import DataFrameMapper # NOQA
7
+ from .cross_validation import cross_val_score , GridSearchCV , RandomizedSearchCV # NOQA
50
8
51
9
52
10
class PassthroughTransformer (TransformerMixin ):
@@ -55,124 +13,3 @@ def fit(self, X, y=None, **fit_params):
55
13
56
14
def transform(self, X):
    """Return X coerced to a float64 numpy array (pass-through transform).

    Fix: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    the builtin float is the documented replacement and maps to the
    same float64 dtype, so callers see identical output.
    """
    return np.array(X).astype(float)
58
-
59
-
60
- def _handle_feature (fea ):
61
- """
62
- Convert 1-dimensional arrays to 2-dimensional column vectors.
63
- """
64
- if len (fea .shape ) == 1 :
65
- fea = np .array ([fea ]).T
66
-
67
- return fea
68
-
69
-
70
- def _build_transformer (transformers ):
71
- if isinstance (transformers , list ):
72
- transformers = make_transformer_pipeline (* transformers )
73
- return transformers
74
-
75
-
76
class DataFrameMapper(BaseEstimator, TransformerMixin):
    """
    Map pandas DataFrame column subsets through their own
    sklearn transformations, then concatenate the results.
    """

    def __init__(self, features, sparse=False):
        """
        Params:

        features    a list of pairs. The first element is the pandas column
                    selector: a string (one column) or a list of strings.
                    The second element is an object supporting sklearn's
                    transform interface, or a list of such objects.
        sparse      return a sparse matrix if True and any extracted
                    feature is sparse. Defaults to False.
        """
        if isinstance(features, list):
            # normalize: a list of transformers becomes one pipeline
            features = [(cols, _build_transformer(tfs))
                        for cols, tfs in features]
        self.features = features
        self.sparse = sparse

    def __setstate__(self, state):
        # compatibility shim for pickles created with sklearn-pandas<1.0.0
        self.features = [(cols, _build_transformer(tfs))
                         for cols, tfs in state['features']]
        self.sparse = state.get('sparse', False)

    def _get_col_subset(self, X, cols):
        """
        Get a subset of columns from the given table X.

        X       a pandas DataFrame (or list of records, or DataWrapper);
                the table to select columns from
        cols    a string or list of strings naming the columns to select

        Returns a numpy array with the data from the selected columns;
        a single string selector yields a 1-D vector.
        """
        single_column = isinstance(cols, string_types)
        if single_column:
            cols = [cols]

        if isinstance(X, list):
            # list of records: project each record, then tabularize
            X = pd.DataFrame([row[cols] for row in X])
        elif isinstance(X, DataWrapper):
            # unwrap back to the underlying DataFrame
            X = X.df

        if single_column:
            return X[cols[0]].values
        return X[cols].values

    def fit(self, X, y=None):
        """
        Fit a transformation from the pipeline.

        X   the data to fit
        """
        for columns, transformers in self.features:
            if transformers is not None:
                transformers.fit(self._get_col_subset(X, columns))
        return self

    def transform(self, X):
        """
        Transform the given data. Assumes that fit has already been called.

        X   the data to transform
        """
        extracted = []
        for columns, transformers in self.features:
            # columns may be a string or a list of strings;
            # pandas handles either selector form.
            Xt = self._get_col_subset(X, columns)
            if transformers is not None:
                Xt = transformers.transform(Xt)
            extracted.append(_handle_feature(Xt))

        # Combine the feature outputs into one array. At this point we
        # lose track of which features came from which input columns,
        # so it's assumed that doesn't matter to the model.
        #
        # If any extracted feature is sparse, combine sparsely;
        # otherwise combine as dense arrays.
        if any(sparse.issparse(fea) for fea in extracted):
            stacked = sparse.hstack(extracted).tocsr()
            # densify unless the mapper was initialized with sparse=True
            if not self.sparse:
                stacked = stacked.toarray()
            return stacked
        return np.hstack(extracted)
0 commit comments