|
17 | 17 | from sklearn.preprocessing import Imputer, StandardScaler
|
18 | 18 | from sklearn.base import BaseEstimator, TransformerMixin
|
19 | 19 | import numpy as np
|
| 20 | +from numpy.testing import assert_array_equal |
20 | 21 |
|
21 | 22 | from sklearn_pandas import (
|
22 | 23 | DataFrameMapper,
|
23 | 24 | PassthroughTransformer,
|
24 | 25 | cross_val_score,
|
25 | 26 | _build_transformer,
|
| 27 | + _handle_feature, |
26 | 28 | )
|
27 | 29 |
|
28 | 30 |
|
@@ -51,78 +53,41 @@ def transform(self, X):
|
51 | 53 | return sparse.csr_matrix(X)
|
52 | 54 |
|
53 | 55 |
|
54 |
| -@pytest.fixture |
55 |
| -def iris_dataframe(): |
56 |
| - iris = load_iris() |
57 |
| - return DataFrame( |
58 |
| - data={ |
59 |
| - iris.feature_names[0]: iris.data[:, 0], |
60 |
| - iris.feature_names[1]: iris.data[:, 1], |
61 |
| - iris.feature_names[2]: iris.data[:, 2], |
62 |
| - iris.feature_names[3]: iris.data[:, 3], |
63 |
| - "species": np.array([iris.target_names[e] for e in iris.target]) |
64 |
| - } |
65 |
| - ) |
66 |
| - |
67 |
| - |
68 |
| -@pytest.fixture |
69 |
| -def cars_dataframe(): |
70 |
| - return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip') |
71 |
| - |
72 |
| - |
73 | 56 | @pytest.fixture
|
74 | 57 | def simple_dataframe():
|
75 | 58 | return pd.DataFrame({'a': [1, 2, 3]})
|
76 | 59 |
|
77 | 60 |
|
78 |
| -def test_nonexistent_columns_explicit_fail(iris_dataframe): |
| 61 | +def test_nonexistent_columns_explicit_fail(simple_dataframe): |
79 | 62 | """
|
80 | 63 | If a nonexistent column is selected, KeyError is raised.
|
81 | 64 | """
|
82 | 65 | mapper = DataFrameMapper(None)
|
83 | 66 | with pytest.raises(KeyError):
|
84 |
| - mapper._get_col_subset(iris_dataframe, ['nonexistent_feature']) |
| 67 | + mapper._get_col_subset(simple_dataframe, ['nonexistent_feature']) |
85 | 68 |
|
86 | 69 |
|
87 |
| -def test_with_iris_dataframe(iris_dataframe): |
88 |
| - pipeline = Pipeline([ |
89 |
| - ("preprocess", DataFrameMapper([ |
90 |
| - ("petal length (cm)", PassthroughTransformer()), |
91 |
| - ("petal width (cm)", PassthroughTransformer()), |
92 |
| - ("sepal length (cm)", PassthroughTransformer()), |
93 |
| - ("sepal width (cm)", PassthroughTransformer()), |
94 |
| - ])), |
95 |
| - ("classify", SVC(kernel='linear')) |
96 |
| - ]) |
97 |
| - data = iris_dataframe.drop("species", axis=1) |
98 |
| - labels = iris_dataframe["species"] |
99 |
| - scores = cross_val_score(pipeline, data, labels) |
100 |
| - assert scores.mean() > 0.96 |
101 |
| - assert (scores.std() * 2) < 0.04 |
102 |
| - |
103 |
| - |
104 |
| -def test_get_col_subset_single_column_array(iris_dataframe): |
| 70 | +def test_get_col_subset_single_column_array(simple_dataframe): |
105 | 71 | """
|
106 | 72 | Selecting a single column should return a 1-dimensional numpy array.
|
107 | 73 | """
|
108 | 74 | mapper = DataFrameMapper(None)
|
109 |
| - array = mapper._get_col_subset(iris_dataframe, "species") |
| 75 | + array = mapper._get_col_subset(simple_dataframe, "a") |
110 | 76 |
|
111 | 77 | assert type(array) == np.ndarray
|
112 |
| - assert array.shape == (len(iris_dataframe["species"]),) |
| 78 | + assert array.shape == (len(simple_dataframe["a"]),) |
113 | 79 |
|
114 | 80 |
|
115 |
| -def test_with_car_dataframe(cars_dataframe): |
116 |
| - pipeline = Pipeline([ |
117 |
| - ("preprocess", DataFrameMapper([ |
118 |
| - ("description", CountVectorizer()), |
119 |
| - ])), |
120 |
| - ("classify", SVC(kernel='linear')) |
121 |
| - ]) |
122 |
| - data = cars_dataframe.drop("model", axis=1) |
123 |
| - labels = cars_dataframe["model"] |
124 |
| - scores = cross_val_score(pipeline, data, labels) |
125 |
| - assert scores.mean() > 0.30 |
| 81 | +def test_get_col_subset_single_column_list(simple_dataframe): |
| 82 | + """ |
| 83 | + Selecting a list of columns (even if the list contains a single element) |
| 84 | + should return a 2-dimensional numpy array. |
| 85 | + """ |
| 86 | + mapper = DataFrameMapper(None) |
| 87 | + array = mapper._get_col_subset(simple_dataframe, ["a"]) |
| 88 | + |
| 89 | + assert type(array) == np.ndarray |
| 90 | + assert array.shape == (len(simple_dataframe["a"]), 1) |
126 | 91 |
|
127 | 92 |
|
128 | 93 | def test_cols_string_array(simple_dataframe):
|
@@ -155,6 +120,22 @@ def test_cols_list_column_vector(simple_dataframe):
|
155 | 120 | assert args[0].shape == (3, 1)
|
156 | 121 |
|
157 | 122 |
|
| 123 | +def test_handle_feature_2dim(): |
| 124 | + """ |
| 125 | + 2-dimensional arrays are returned unchanged. |
| 126 | + """ |
| 127 | + array = np.array([[1, 2], [3, 4]]) |
| 128 | + assert_array_equal(_handle_feature(array), array) |
| 129 | + |
| 130 | + |
| 131 | +def test_handle_feature_1dim(): |
| 132 | + """ |
| 133 | + 1-dimensional arrays are converted to 2-dimensional column vectors. |
| 134 | + """ |
| 135 | + array = np.array([1, 2]) |
| 136 | + assert_array_equal(_handle_feature(array), np.array([[1], [2]])) |
| 137 | + |
| 138 | + |
158 | 139 | def test_build_transformers():
|
159 | 140 | """
|
160 | 141 | When a list of transformers is passed, return a pipeline with
|
@@ -213,3 +194,54 @@ def test_sparse_off(simple_dataframe):
|
213 | 194 |
|
214 | 195 | dmatrix = mapper.fit_transform(df)
|
215 | 196 | assert type(dmatrix) != sparse.csr.csr_matrix
|
| 197 | + |
| 198 | + |
| 199 | +# Integration tests with real dataframes |
| 200 | + |
| 201 | +@pytest.fixture |
| 202 | +def iris_dataframe(): |
| 203 | + iris = load_iris() |
| 204 | + return DataFrame( |
| 205 | + data={ |
| 206 | + iris.feature_names[0]: iris.data[:, 0], |
| 207 | + iris.feature_names[1]: iris.data[:, 1], |
| 208 | + iris.feature_names[2]: iris.data[:, 2], |
| 209 | + iris.feature_names[3]: iris.data[:, 3], |
| 210 | + "species": np.array([iris.target_names[e] for e in iris.target]) |
| 211 | + } |
| 212 | + ) |
| 213 | + |
| 214 | + |
| 215 | +@pytest.fixture |
| 216 | +def cars_dataframe(): |
| 217 | + return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip') |
| 218 | + |
| 219 | + |
| 220 | +def test_with_iris_dataframe(iris_dataframe): |
| 221 | + pipeline = Pipeline([ |
| 222 | + ("preprocess", DataFrameMapper([ |
| 223 | + ("petal length (cm)", PassthroughTransformer()), |
| 224 | + ("petal width (cm)", PassthroughTransformer()), |
| 225 | + ("sepal length (cm)", PassthroughTransformer()), |
| 226 | + ("sepal width (cm)", PassthroughTransformer()), |
| 227 | + ])), |
| 228 | + ("classify", SVC(kernel='linear')) |
| 229 | + ]) |
| 230 | + data = iris_dataframe.drop("species", axis=1) |
| 231 | + labels = iris_dataframe["species"] |
| 232 | + scores = cross_val_score(pipeline, data, labels) |
| 233 | + assert scores.mean() > 0.96 |
| 234 | + assert (scores.std() * 2) < 0.04 |
| 235 | + |
| 236 | + |
| 237 | +def test_with_car_dataframe(cars_dataframe): |
| 238 | + pipeline = Pipeline([ |
| 239 | + ("preprocess", DataFrameMapper([ |
| 240 | + ("description", CountVectorizer()), |
| 241 | + ])), |
| 242 | + ("classify", SVC(kernel='linear')) |
| 243 | + ]) |
| 244 | + data = cars_dataframe.drop("model", axis=1) |
| 245 | + labels = cars_dataframe["model"] |
| 246 | + scores = cross_val_score(pipeline, data, labels) |
| 247 | + assert scores.mean() > 0.30 |
0 commit comments