Skip to content

Commit 19536ef

Browse files
committed
Write some more unit tests.
1 parent b00e3fb commit 19536ef

File tree

2 files changed

+87
-53
lines changed

2 files changed

+87
-53
lines changed

sklearn_pandas/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,9 @@ def transform(self, X):
5757

5858

5959
def _handle_feature(fea):
60-
# convert 1-dimensional arrays to 2-dimensional column vectors
60+
"""
61+
Convert 1-dimensional arrays to 2-dimensional column vectors.
62+
"""
6163
if len(fea.shape) == 1:
6264
fea = np.array([fea]).T
6365

tests/test_dataframe_mapper.py

Lines changed: 84 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -17,12 +17,14 @@
1717
from sklearn.preprocessing import Imputer, StandardScaler
1818
from sklearn.base import BaseEstimator, TransformerMixin
1919
import numpy as np
20+
from numpy.testing import assert_array_equal
2021

2122
from sklearn_pandas import (
2223
DataFrameMapper,
2324
PassthroughTransformer,
2425
cross_val_score,
2526
_build_transformer,
27+
_handle_feature,
2628
)
2729

2830

@@ -51,78 +53,41 @@ def transform(self, X):
5153
return sparse.csr_matrix(X)
5254

5355

54-
@pytest.fixture
55-
def iris_dataframe():
56-
iris = load_iris()
57-
return DataFrame(
58-
data={
59-
iris.feature_names[0]: iris.data[:, 0],
60-
iris.feature_names[1]: iris.data[:, 1],
61-
iris.feature_names[2]: iris.data[:, 2],
62-
iris.feature_names[3]: iris.data[:, 3],
63-
"species": np.array([iris.target_names[e] for e in iris.target])
64-
}
65-
)
66-
67-
68-
@pytest.fixture
69-
def cars_dataframe():
70-
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
71-
72-
7356
@pytest.fixture
7457
def simple_dataframe():
7558
return pd.DataFrame({'a': [1, 2, 3]})
7659

7760

78-
def test_nonexistent_columns_explicit_fail(iris_dataframe):
61+
def test_nonexistent_columns_explicit_fail(simple_dataframe):
7962
"""
8063
If a nonexistent column is selected, KeyError is raised.
8164
"""
8265
mapper = DataFrameMapper(None)
8366
with pytest.raises(KeyError):
84-
mapper._get_col_subset(iris_dataframe, ['nonexistent_feature'])
67+
mapper._get_col_subset(simple_dataframe, ['nonexistent_feature'])
8568

8669

87-
def test_with_iris_dataframe(iris_dataframe):
88-
pipeline = Pipeline([
89-
("preprocess", DataFrameMapper([
90-
("petal length (cm)", PassthroughTransformer()),
91-
("petal width (cm)", PassthroughTransformer()),
92-
("sepal length (cm)", PassthroughTransformer()),
93-
("sepal width (cm)", PassthroughTransformer()),
94-
])),
95-
("classify", SVC(kernel='linear'))
96-
])
97-
data = iris_dataframe.drop("species", axis=1)
98-
labels = iris_dataframe["species"]
99-
scores = cross_val_score(pipeline, data, labels)
100-
assert scores.mean() > 0.96
101-
assert (scores.std() * 2) < 0.04
102-
103-
104-
def test_get_col_subset_single_column_array(iris_dataframe):
70+
def test_get_col_subset_single_column_array(simple_dataframe):
10571
"""
10672
Selecting a single column should return a 1-dimensional numpy array.
10773
"""
10874
mapper = DataFrameMapper(None)
109-
array = mapper._get_col_subset(iris_dataframe, "species")
75+
array = mapper._get_col_subset(simple_dataframe, "a")
11076

11177
assert type(array) == np.ndarray
112-
assert array.shape == (len(iris_dataframe["species"]),)
78+
assert array.shape == (len(simple_dataframe["a"]),)
11379

11480

115-
def test_with_car_dataframe(cars_dataframe):
116-
pipeline = Pipeline([
117-
("preprocess", DataFrameMapper([
118-
("description", CountVectorizer()),
119-
])),
120-
("classify", SVC(kernel='linear'))
121-
])
122-
data = cars_dataframe.drop("model", axis=1)
123-
labels = cars_dataframe["model"]
124-
scores = cross_val_score(pipeline, data, labels)
125-
assert scores.mean() > 0.30
81+
def test_get_col_subset_single_column_list(simple_dataframe):
82+
"""
83+
Selecting a list of columns (even if the list contains a single element)
84+
should return a 2-dimensional numpy array.
85+
"""
86+
mapper = DataFrameMapper(None)
87+
array = mapper._get_col_subset(simple_dataframe, ["a"])
88+
89+
assert type(array) == np.ndarray
90+
assert array.shape == (len(simple_dataframe["a"]), 1)
12691

12792

12893
def test_cols_string_array(simple_dataframe):
@@ -155,6 +120,22 @@ def test_cols_list_column_vector(simple_dataframe):
155120
assert args[0].shape == (3, 1)
156121

157122

123+
def test_handle_feature_2dim():
124+
"""
125+
2-dimensional arrays are returned unchanged.
126+
"""
127+
array = np.array([[1, 2], [3, 4]])
128+
assert_array_equal(_handle_feature(array), array)
129+
130+
131+
def test_handle_feature_1dim():
132+
"""
133+
1-dimensional arrays are converted to 2-dimensional column vectors.
134+
"""
135+
array = np.array([1, 2])
136+
assert_array_equal(_handle_feature(array), np.array([[1], [2]]))
137+
138+
158139
def test_build_transformers():
159140
"""
160141
When a list of transformers is passed, return a pipeline with
@@ -213,3 +194,54 @@ def test_sparse_off(simple_dataframe):
213194

214195
dmatrix = mapper.fit_transform(df)
215196
assert type(dmatrix) != sparse.csr.csr_matrix
197+
198+
199+
# Integration tests with real dataframes
200+
201+
@pytest.fixture
202+
def iris_dataframe():
203+
iris = load_iris()
204+
return DataFrame(
205+
data={
206+
iris.feature_names[0]: iris.data[:, 0],
207+
iris.feature_names[1]: iris.data[:, 1],
208+
iris.feature_names[2]: iris.data[:, 2],
209+
iris.feature_names[3]: iris.data[:, 3],
210+
"species": np.array([iris.target_names[e] for e in iris.target])
211+
}
212+
)
213+
214+
215+
@pytest.fixture
216+
def cars_dataframe():
217+
return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
218+
219+
220+
def test_with_iris_dataframe(iris_dataframe):
221+
pipeline = Pipeline([
222+
("preprocess", DataFrameMapper([
223+
("petal length (cm)", PassthroughTransformer()),
224+
("petal width (cm)", PassthroughTransformer()),
225+
("sepal length (cm)", PassthroughTransformer()),
226+
("sepal width (cm)", PassthroughTransformer()),
227+
])),
228+
("classify", SVC(kernel='linear'))
229+
])
230+
data = iris_dataframe.drop("species", axis=1)
231+
labels = iris_dataframe["species"]
232+
scores = cross_val_score(pipeline, data, labels)
233+
assert scores.mean() > 0.96
234+
assert (scores.std() * 2) < 0.04
235+
236+
237+
def test_with_car_dataframe(cars_dataframe):
238+
pipeline = Pipeline([
239+
("preprocess", DataFrameMapper([
240+
("description", CountVectorizer()),
241+
])),
242+
("classify", SVC(kernel='linear'))
243+
])
244+
data = cars_dataframe.drop("model", axis=1)
245+
labels = cars_dataframe["model"]
246+
scores = cross_val_score(pipeline, data, labels)
247+
assert scores.mean() > 0.30

0 commit comments

Comments
 (0)