Skip to content

[MRG] FIX enable bootstraping in bagging #360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions doc/ensemble.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ classifier will favor the majority classes::
BaggingClassifier(...)
>>> y_pred = bc.predict(X_test)
>>> confusion_matrix(y_test, y_pred)
array([[ 0, 0, 12],
[ 0, 0, 59],
[ 0, 0, 1179]])
array([[ 9, 1, 2],
[ 0, 54, 5],
[ 1, 6, 1172]])

:class:`BalancedBaggingClassifier` allows to resample each subset of data
before to train each estimator of the ensemble. In short, it combines the
Expand All @@ -105,8 +105,8 @@ takes the same parameters than the scikit-learn
>>> y_pred = bbc.predict(X_test)
>>> confusion_matrix(y_test, y_pred)
array([[ 12, 0, 0],
[ 0, 55, 4],
[ 68, 53, 1058]])
[ 1, 54, 4],
[ 49, 53, 1077]])

See
:ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
12 changes: 5 additions & 7 deletions examples/ensemble/plot_comparison_bagging_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,11 @@
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

from imblearn.datasets import make_imbalance
from imblearn.datasets import fetch_datasets
from imblearn.ensemble import BalancedBaggingClassifier

from imblearn.metrics import classification_report_imbalanced
Expand Down Expand Up @@ -70,9 +69,8 @@ def plot_confusion_matrix(cm, classes,
plt.xlabel('Predicted label')


iris = load_iris()
X, y = make_imbalance(iris.data, iris.target, ratio={0: 25, 1: 40, 2: 50},
random_state=0)
ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
Expand All @@ -90,15 +88,15 @@ def plot_confusion_matrix(cm, classes,
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=iris.target_names,
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=iris.target_names,
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target),
title='Confusion matrix using BalancedBaggingClassifier')

plt.show()
17 changes: 0 additions & 17 deletions imblearn/ensemble/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,6 @@
old_generate = _generate_bagging_indices


def _masked_bagging_indices(random_state, bootstrap_features,
bootstrap_samples, n_features, n_samples,
max_features, max_samples):
"""Monkey-patch to always get a mask instead of indices"""
feature_indices, sample_indices = old_generate(random_state,
bootstrap_features,
bootstrap_samples,
n_features, n_samples,
max_features, max_samples)
sample_indices = indices_to_mask(sample_indices, n_samples)

return feature_indices, sample_indices


sklearn.ensemble.bagging._generate_bagging_indices = _masked_bagging_indices


class BalancedBaggingClassifier(BaggingClassifier):
"""A Bagging classifier with additional balancing.

Expand Down
76 changes: 39 additions & 37 deletions imblearn/ensemble/tests/test_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,43 +400,45 @@ def test_oob_score_consistency():
assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_


def test_estimators_samples():
# Check that format of estimators_samples_ is correct and that results
# generated at fit time can be identically reproduced at a later time
# using data saved in object attributes.
X, y = make_hastie_10_2(n_samples=200, random_state=1)

# remap the y outside of the BalancedBaggingclassifier
# _, y = np.unique(y, return_inverse=True)
bagging = BalancedBaggingClassifier(LogisticRegression(), max_samples=0.5,
max_features=0.5, random_state=1,
bootstrap=False)
bagging.fit(X, y)

# Get relevant attributes
estimators_samples = bagging.estimators_samples_
estimators_features = bagging.estimators_features_
estimators = bagging.estimators_

# Test for correct formatting
assert len(estimators_samples) == len(estimators)
assert len(estimators_samples[0]) == len(X)
assert estimators_samples[0].dtype.kind == 'b'

# Re-fit single estimator to test for consistent sampling
estimator_index = 0
estimator_samples = estimators_samples[estimator_index]
estimator_features = estimators_features[estimator_index]
estimator = estimators[estimator_index]

X_train = (X[estimator_samples])[:, estimator_features]
y_train = y[estimator_samples]

orig_coefs = estimator.steps[-1][1].coef_
estimator.fit(X_train, y_train)
new_coefs = estimator.steps[-1][1].coef_

assert_array_almost_equal(orig_coefs, new_coefs)
# FIXME: uncomment when #9723 is merged in scikit-learn
# def test_estimators_samples():
# # Check that format of estimators_samples_ is correct and that results
# # generated at fit time can be identically reproduced at a later time
# # using data saved in object attributes.
# X, y = make_hastie_10_2(n_samples=200, random_state=1)

# # remap the y outside of the BalancedBaggingclassifier
# # _, y = np.unique(y, return_inverse=True)
# bagging = BalancedBaggingClassifier(LogisticRegression(),
# max_samples=0.5,
# max_features=0.5, random_state=1,
# bootstrap=False)
# bagging.fit(X, y)

# # Get relevant attributes
# estimators_samples = bagging.estimators_samples_
# estimators_features = bagging.estimators_features_
# estimators = bagging.estimators_

# # Test for correct formatting
# assert len(estimators_samples) == len(estimators)
# assert len(estimators_samples[0]) == len(X)
# assert estimators_samples[0].dtype.kind == 'b'

# # Re-fit single estimator to test for consistent sampling
# estimator_index = 0
# estimator_samples = estimators_samples[estimator_index]
# estimator_features = estimators_features[estimator_index]
# estimator = estimators[estimator_index]

# X_train = (X[estimator_samples])[:, estimator_features]
# y_train = y[estimator_samples]

# orig_coefs = estimator.steps[-1][1].coef_
# estimator.fit(X_train, y_train)
# new_coefs = estimator.steps[-1][1].coef_

# assert_array_almost_equal(orig_coefs, new_coefs)


def test_max_samples_consistency():
Expand Down