|
17 | 17 | from sklearn.pipeline import Pipeline
|
18 | 18 | from sklearn.svm import SVC
|
19 | 19 | from sklearn.feature_extraction.text import CountVectorizer
|
20 |
| -from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder |
| 20 | +from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder, LabelBinarizer |
21 | 21 | from sklearn.feature_selection import SelectKBest, chi2
|
22 | 22 | from sklearn.base import BaseEstimator, TransformerMixin
|
| 23 | +import sklearn.decomposition |
23 | 24 | import numpy as np
|
24 | 25 | from numpy.testing import assert_array_equal
|
25 | 26 | import pickle
|
@@ -77,6 +78,81 @@ def complex_dataframe():
|
77 | 78 | 'feat2': [1, 2, 3, 2, 3, 4]})
|
78 | 79 |
|
79 | 80 |
|
def test_simple_df(simple_dataframe):
    """
    Get a dataframe from a simple mapped dataframe.

    Column 'a' is mapped with a ``None`` transformer and ``df_out=True``;
    the result must be a pandas DataFrame whose 'a' column has as many
    rows as the input column.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)], df_out=True)
    transformed = mapper.fit_transform(df)
    # isinstance instead of an exact type comparison: a DataFrame subclass
    # is still a valid DataFrame output.
    assert isinstance(transformed, pd.DataFrame)
    assert len(transformed["a"]) == len(df["a"])
| 90 | + |
| 91 | + |
def test_complex_df(complex_dataframe):
    """
    Get a dataframe from a complex mapped dataframe.

    All three columns are mapped with a ``None`` transformer; the output
    must have the same number of rows as the input, overall and for each
    input column.
    """
    frame = complex_dataframe
    spec = [(name, None) for name in ('target', 'feat1', 'feat2')]
    result = DataFrameMapper(spec, df_out=True).fit_transform(frame)

    assert len(result) == len(complex_dataframe)
    for column in frame.columns:
        assert len(result[column]) == len(frame[column])
| 102 | + |
| 103 | + |
def test_binarizer_df():
    """
    Check level names from LabelBinarizer.

    A 'target' column with the three labels 'a', 'b' and 'c' is expected
    to produce three output columns named after the input column and
    each label.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    binarizing_mapper = DataFrameMapper(
        [('target', LabelBinarizer())],
        df_out=True,
    )
    result = binarizing_mapper.fit_transform(frame)

    # One comparison covers both the column count and every name.
    assert list(result.columns) == ['target_a', 'target_b', 'target_c']
| 116 | + |
| 117 | + |
def test_binarizer2_df():
    """
    Check level names from LabelBinarizer with just one output column.

    With only two distinct labels in 'target', the test expects a single
    output column that keeps the plain input column name.
    """
    frame = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    binarizing_mapper = DataFrameMapper(
        [('target', LabelBinarizer())],
        df_out=True,
    )
    result = binarizing_mapper.fit_transform(frame)

    # One comparison covers both the column count and the name.
    assert list(result.columns) == ['target']
| 128 | + |
| 129 | + |
def test_onehot_df():
    """
    Check level ids from one-hot.

    The 'target' column holds the four values 0..3; the test expects four
    output columns and checks the names of the first and the last one.
    """
    frame = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    onehot_mapper = DataFrameMapper(
        [(['target'], OneHotEncoder())],
        df_out=True,
    )
    names = onehot_mapper.fit_transform(frame).columns

    assert len(names) == 4
    first, last = names[0], names[3]
    assert first == 'target_0'
    assert last == 'target_3'
| 141 | + |
| 142 | + |
def test_pca(complex_dataframe):
    """
    Check multi in and out with PCA.

    Two input columns are fed together into a two-component PCA; the test
    expects two output columns named from the joined input names plus a
    positional index.
    """
    pca_mapper = DataFrameMapper(
        [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))],
        df_out=True,
    )
    result = pca_mapper.fit_transform(complex_dataframe)

    # One comparison covers both the column count and every name.
    assert list(result.columns) == ['feat1_feat2_0', 'feat1_feat2_1']
| 154 | + |
| 155 | + |
80 | 156 | def test_nonexistent_columns_explicit_fail(simple_dataframe):
|
81 | 157 | """
|
82 | 158 | If a nonexistent column is selected, KeyError is raised.
|
|
0 commit comments