18
18
from ..utils import Bunch
19
19
from ..utils import check_random_state
20
20
from ..utils import check_pandas_support
21
+ from ..utils .deprecation import deprecated
21
22
22
23
import numpy as np
23
24
@@ -1109,8 +1110,45 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
1109
1110
)
1110
1111
1111
1112
1113
+ @deprecated (
1114
+ r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.
1115
+
1116
+ The Boston housing prices dataset has an ethical problem. You can refer to
1117
+ the documentation of this function for further details.
1118
+
1119
+ The scikit-learn maintainers therefore strongly discourage the use of this
1120
+ dataset unless the purpose of the code is to study and educate about
1121
+ ethical issues in data science and machine learning.
1122
+
1123
+ In this case special case, you can fetch the dataset from the original
1124
+ source::
1125
+
1126
+ import pandas as pd
1127
+ import numpy as np
1128
+
1129
+
1130
+ data_url = "http://lib.stat.cmu.edu/datasets/boston"
1131
+ raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
1132
+ data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
1133
+ target = raw_df.values[1::2, 2]
1134
+
1135
+ Alternative datasets include the California housing dataset (i.e.
1136
+ func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
1137
+ dataset. You can load the datasets as follows:
1138
+
1139
+ from sklearn.datasets import fetch_california_housing
1140
+ housing = fetch_california_housing()
1141
+
1142
+ for the California housing dataset and:
1143
+
1144
+ from sklearn.datasets import fetch_openml
1145
+ housing = fetch_openml(name="house_prices", as_frame=True)
1146
+
1147
+ for the Ames housing dataset.
1148
+ """
1149
+ )
1112
1150
def load_boston (* , return_X_y = False ):
1113
- """Load and return the boston house-prices dataset (regression).
1151
+ r """Load and return the boston house-prices dataset (regression).
1114
1152
1115
1153
============== ==============
1116
1154
Samples total 506
@@ -1121,6 +1159,50 @@ def load_boston(*, return_X_y=False):
1121
1159
1122
1160
Read more in the :ref:`User Guide <boston_dataset>`.
1123
1161
1162
+ .. deprecated:: 1.0
1163
+ This function is deprecated in 1.0 and will be removed in 1.2. See the
1164
+ warning message below for further details regarding the alternative
1165
+ datasets.
1166
+
1167
+ .. warning::
1168
+ The Boston housing prices dataset has an ethical problem: as
1169
+ investigated in [1]_, the authors of this dataset engineered a
1170
+ non-invertible variable "B" assuming that racial self-segregation had a
1171
+ positive impact on house prices [2]_. Furthermore the goal of the
1172
+ research that led to the creation of this dataset was to study the
1173
+ impact of air quality but it did not give adequate demonstration of the
1174
+ validity of this assumption.
1175
+
1176
+ The scikit-learn maintainers therefore strongly discourage the use of
1177
+ this dataset unless the purpose of the code is to study and educate
1178
+ about ethical issues in data science and machine learning.
1179
+
1180
+ In this special case, you can fetch the dataset from the original
1181
+ source::
1182
+
1183
+ import pandas as pd # doctest: +SKIP
1184
+ import numpy as np
1185
+
1186
+
1187
+ data_url = "http://lib.stat.cmu.edu/datasets/boston"
1188
+ raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
1189
+ data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
1190
+ target = raw_df.values[1::2, 2]
1191
+
1192
+ Alternative datasets include the California housing dataset [3]_
1193
+ (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames
1194
+ housing dataset [4]_. You can load the datasets as follows::
1195
+
1196
+ from sklearn.datasets import fetch_california_housing
1197
+ housing = fetch_california_housing()
1198
+
1199
+ for the California housing dataset and::
1200
+
1201
+ from sklearn.datasets import fetch_openml
1202
+ housing = fetch_openml(name="house_prices", as_frame=True) # noqa
1203
+
1204
+ for the Ames housing dataset.
1205
+
1124
1206
Parameters
1125
1207
----------
1126
1208
return_X_y : bool, default=False
@@ -1136,7 +1218,7 @@ def load_boston(*, return_X_y=False):
1136
1218
1137
1219
data : ndarray of shape (506, 13)
1138
1220
The data matrix.
1139
- target : ndarray of shape (506, )
1221
+ target : ndarray of shape (506,)
1140
1222
The regression target.
1141
1223
filename : str
1142
1224
The physical location of boston csv dataset.
@@ -1157,13 +1239,37 @@ def load_boston(*, return_X_y=False):
1157
1239
.. versionchanged:: 0.20
1158
1240
Fixed a wrong data point at [445, 0].
1159
1241
1242
+ References
1243
+ ----------
1244
+ .. [1] `Racist data destruction? M Carlisle,
1245
+ <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
1246
+ .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
1247
+ "Hedonic housing prices and the demand for clean air."
1248
+ Journal of environmental economics and management 5.1 (1978): 81-102.
1249
+ <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
1250
+ .. [3] `California housing dataset
1251
+ <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
1252
+ .. [4] `Ames housing dataset
1253
+ <https://www.openml.org/d/42165>`_
1254
+
1160
1255
Examples
1161
1256
--------
1257
+ >>> import warnings
1162
1258
>>> from sklearn.datasets import load_boston
1163
- >>> X, y = load_boston(return_X_y=True)
1259
+ >>> with warnings.catch_warnings():
1260
+ ... # You should probably not use this dataset.
1261
+ ... warnings.filterwarnings("ignore")
1262
+ ... X, y = load_boston(return_X_y=True)
1164
1263
>>> print(X.shape)
1165
1264
(506, 13)
1166
1265
"""
1266
+ # TODO: once the deprecation period is over, implement a module level
1267
+ # `__getattr__` function in `sklearn.datasets` to raise an exception with
1268
+ # an informative error message at import time instead of just removing
1269
+ # load_boston. The goal is to avoid having beginners that copy-paste code
1270
+ # from numerous books and tutorials that use this dataset loader get
1271
+ # a confusing ImportError when trying to learn scikit-learn.
1272
+ # See: https://www.python.org/dev/peps/pep-0562/
1167
1273
1168
1274
descr_text = load_descr ("boston_house_prices.rst" )
1169
1275
0 commit comments