Skip to content

Commit 948da4a

Browse files
committed
format code
1 parent 9b2ec7f commit 948da4a

File tree

2 files changed

+105
-92
lines changed

2 files changed

+105
-92
lines changed

imblearn/over_sampling/_mlsmote.py

Lines changed: 97 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import collections
44
import random
55

6+
67
class MLSMOTE:
78
"""Over-sampling using MLSMOTE.
89
@@ -35,47 +36,49 @@ class MLSMOTE:
3536
Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019.
3637
3738
"""
38-
def __init__(self,categorical_features,k_neighbors=5 ,sampling_strategy='ranking'):
39-
self.k_neighbors=k_neighbors
40-
self.sampling_strategy_=sampling_strategy
39+
40+
def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'):
41+
self.k_neighbors = k_neighbors
42+
self.sampling_strategy_ = sampling_strategy
4143
self.categorical_features = categorical_features
42-
self.continuous_features_= None
44+
self.continuous_features_ = None
4345
self.unique_labels = []
44-
self.labels=[]
45-
self.features=[]
46+
self.labels = []
47+
self.features = []
4648

47-
def fit_resample(self,X,y):
49+
def fit_resample(self, X, y):
4850
self.n_features_ = X.shape[1]
49-
self.labels=np.array([np.array(xi) for xi in y])
51+
self.labels = np.array([np.array(xi) for xi in y])
5052

5153
self._validate_estimator()
5254

5355
X_resampled = X.copy()
5456
y_resampled = y.copy()
5557

5658
self.unique_labels = self._collect_unique_labels(y)
57-
self.features=X
59+
self.features = X
5860

59-
X_synth=[]
60-
y_synth=[]
61+
X_synth = []
62+
y_synth = []
6163

62-
append_X_synth=X_synth.append
63-
append_y_synth=y_synth.append
64-
mean_ir=self._get_mean_imbalance_ratio()
64+
append_X_synth = X_synth.append
65+
append_y_synth = y_synth.append
66+
mean_ir = self._get_mean_imbalance_ratio()
6567
for label in self.unique_labels:
66-
irlbl=self._get_imbalance_ratio_per_label(label)
68+
irlbl = self._get_imbalance_ratio_per_label(label)
6769
if irlbl > mean_ir:
68-
min_bag=self._get_all_instances_of_label(label)
70+
min_bag = self._get_all_instances_of_label(label)
6971
for sample in min_bag:
70-
distances=self._calc_distances(sample,min_bag)
71-
distances=np.sort(distances,order='distance')
72-
neighbours=distances[:self.k_neighbors]
73-
ref_neigh=np.random.choice(neighbours,1)[0]
74-
X_new,y_new=self._create_new_sample(sample,ref_neigh[1],[x[1] for x in neighbours])
72+
distances = self._calc_distances(sample, min_bag)
73+
distances = np.sort(distances, order='distance')
74+
neighbours = distances[:self.k_neighbors]
75+
ref_neigh = np.random.choice(neighbours, 1)[0]
76+
X_new, y_new = self._create_new_sample(
77+
sample, ref_neigh[1], [x[1] for x in neighbours])
7578
append_X_synth(X_new)
7679
append_y_synth(y_new)
7780

78-
return np.concatenate((X_resampled,np.array(X_synth))),np.array(y_resampled.tolist()+y_synth)
81+
return np.concatenate((X_resampled, np.array(X_synth))), np.array(y_resampled.tolist()+y_synth)
7982

8083
def _validate_estimator(self):
8184
categorical_features = np.asarray(self.categorical_features)
@@ -101,102 +104,110 @@ def _collect_unique_labels(self, y):
101104
"""A support function that flattens the labelsets and returns one set of unique labels"""
102105
return np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))
103106

104-
def _create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids):
105-
sample=self.features[sample_id]
106-
sample_labels=self.labels[sample_id]
107-
synth_sample=np.copy(sample)
108-
ref_neigh=self.features[ref_neigh_id]
109-
neighbours_labels=[]
107+
def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):
108+
sample = self.features[sample_id]
109+
sample_labels = self.labels[sample_id]
110+
synth_sample = np.copy(sample)
111+
ref_neigh = self.features[ref_neigh_id]
112+
neighbours_labels = []
110113
for ni in neighbour_ids:
111114
neighbours_labels.append(self.labels[ni].tolist())
112115
for i in range(synth_sample.shape[0]):
113116
if i in self.continuous_features_:
114-
diff=ref_neigh[i]-sample[i]
115-
offset=diff*random.uniform(0,1)
116-
synth_sample[i]=sample[i]+offset
117+
diff = ref_neigh[i]-sample[i]
118+
offset = diff*random.uniform(0, 1)
119+
synth_sample[i] = sample[i]+offset
117120
if i in self.categorical_features_:
118-
synth_sample[i]=self._get_most_frequent_value(self.features[neighbour_ids,i])
119-
120-
labels=sample_labels.tolist()
121-
labels+=[a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])]
122-
labels=list(set(labels))
123-
if self.sampling_strategy_=='ranking':
124-
head_index=int((self.k_neighbors+ 1)/2)
125-
y=labels[:head_index]
126-
if self.sampling_strategy_=='union':
127-
y=labels[:]
128-
if self.sampling_strategy_=='intersection':
129-
y=list(set.intersection(*neighbours_labels))
130-
131-
X=synth_sample
132-
return X,y
133-
134-
135-
def _calc_distances(self,sample,min_bag):
136-
distances=[]
137-
append_distances=distances.append
121+
synth_sample[i] = self._get_most_frequent_value(
122+
self.features[neighbour_ids, i])
123+
124+
labels = sample_labels.tolist()
125+
labels += [a for x in neighbours_labels for a in (
126+
x if isinstance(x, list) else [x])]
127+
labels = list(set(labels))
128+
if self.sampling_strategy_ == 'ranking':
129+
head_index = int((self.k_neighbors + 1)/2)
130+
y = labels[:head_index]
131+
if self.sampling_strategy_ == 'union':
132+
y = labels[:]
133+
if self.sampling_strategy_ == 'intersection':
134+
y = list(set.intersection(*neighbours_labels))
135+
136+
X = synth_sample
137+
return X, y
138+
139+
def _calc_distances(self, sample, min_bag):
140+
distances = []
141+
append_distances = distances.append
138142
for bag_sample in min_bag:
139-
nominal_distances=np.array([self._get_vdm(self.features[sample,cat],self.features[bag_sample,cat])for cat in self.categorical_features_])
140-
ordinal_distances=np.array([self._get_euclidean_distance(self.features[sample,num],self.features[bag_sample,num])for num in self.continuous_features_])
141-
dists=np.array([nominal_distances.sum(),ordinal_distances.sum()])
142-
append_distances((dists.sum(),bag_sample))
143-
dtype = np.dtype([('distance', float), ('index', int)])
144-
return np.array(distances,dtype=dtype)
145-
146-
147-
def _get_euclidean_distance(self,first,second):
148-
euclidean_distance=np.linalg.norm(first-second)
143+
nominal_distances = np.array([self._get_vdm(
144+
self.features[sample, cat], self.features[bag_sample, cat])for cat in self.categorical_features_])
145+
ordinal_distances = np.array([self._get_euclidean_distance(
146+
self.features[sample, num], self.features[bag_sample, num])for num in self.continuous_features_])
147+
dists = np.array(
148+
[nominal_distances.sum(), ordinal_distances.sum()])
149+
append_distances((dists.sum(), bag_sample))
150+
dtype = np.dtype([('distance', float), ('index', int)])
151+
return np.array(distances, dtype=dtype)
152+
153+
def _get_euclidean_distance(self, first, second):
154+
euclidean_distance = np.linalg.norm(first-second)
149155
return euclidean_distance
150156

151-
def _get_vdm(self,first,second):
157+
def _get_vdm(self, first, second):
152158
"""A support function to compute the Value Difference Metric (VDM) described in https://arxiv.org/pdf/cs/9701101.pdf"""
153159
def f(c):
154-
N_ax=len(np.where(self.features[:,self.categorical_features_]==first))
155-
N_ay=len(np.where(self.features[:,self.categorical_features_]==second))
156-
c_instances=self._get_all_instances_of_label(c)
157-
N_axc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==first)[0])
158-
N_ayc=len(np.where(self.features[np.ix_(c_instances,self.categorical_features_)]==second)[0])
160+
N_ax = len(
161+
np.where(self.features[:, self.categorical_features_] == first))
162+
N_ay = len(
163+
np.where(self.features[:, self.categorical_features_] == second))
164+
c_instances = self._get_all_instances_of_label(c)
165+
N_axc = len(np.where(self.features[np.ix_(
166+
c_instances, self.categorical_features_)] == first)[0])
167+
N_ayc = len(np.where(self.features[np.ix_(
168+
c_instances, self.categorical_features_)] == second)[0])
159169
return np.square(np.abs((N_axc/N_ax)-(N_ayc/N_ay)))
160-
170+
161171
return np.sum(np.array([f(c)for c in self.unique_labels]))
162172

163-
def _get_all_instances_of_label(self,label):
164-
instance_ids=[]
165-
append_instance_id=instance_ids.append
166-
for i,label_set in enumerate(self.labels):
173+
def _get_all_instances_of_label(self, label):
174+
instance_ids = []
175+
append_instance_id = instance_ids.append
176+
for i, label_set in enumerate(self.labels):
167177
if label in label_set:
168178
append_instance_id(i)
169179
return np.array(instance_ids)
170180

171181
def _get_mean_imbalance_ratio(self):
172-
ratio_sum=np.sum(np.array(list(map(self._get_imbalance_ratio_per_label,self.unique_labels))))
182+
ratio_sum = np.sum(
183+
np.array(list(map(self._get_imbalance_ratio_per_label, self.unique_labels))))
173184
return ratio_sum/self.unique_labels.shape[0]
174185

175-
def _get_imbalance_ratio_per_label(self,label):
176-
sum_array=list(map(self._sum_h,self.unique_labels))
177-
sum_array=np.array(sum_array)
186+
def _get_imbalance_ratio_per_label(self, label):
187+
sum_array = list(map(self._sum_h, self.unique_labels))
188+
sum_array = np.array(sum_array)
178189
return sum_array.max()/self._sum_h(label)
179190

180-
def _sum_h(self,label):
181-
h_sum=0
182-
def h(l,Y):
191+
def _sum_h(self, label):
192+
h_sum = 0
193+
194+
def h(l, Y):
183195
if l in Y:
184196
return 1
185197
else:
186198
return 0
187199

188200
for label_set in self.labels:
189-
h_sum+=h(label,label_set)
201+
h_sum += h(label, label_set)
190202
return h_sum
191203

192-
193-
def _get_label_frequencies(self,labels):
204+
def _get_label_frequencies(self, labels):
194205
"""A support function to get the frequencies of labels"""
195-
frequency_map=np.array(np.unique(labels, return_counts=True)).T
196-
frequencies=np.array([x[1] for x in count_map])
206+
frequency_map = np.array(np.unique(labels, return_counts=True)).T
207+
frequencies = np.array([x[1] for x in count_map])
197208
return frequencies
198-
209+
199210
def _get_most_frequent_value(self, values):
200211
"""A support function to get the most frequent value of a list of values"""
201212
uniques, indices = np.unique(values, return_inverse=True)
202-
return uniques[np.argmax(np.bincount(indices))]
213+
return uniques[np.argmax(np.bincount(indices))]

imblearn/over_sampling/tests/test_mlsmote.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def data_heterogneous_ordered():
2020
X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
2121
# create a categorical feature using some integer
2222
X[:, 3] = rng.randint(3, size=30)
23-
y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
23+
y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
2424
# return the categories
2525
return X, y, [2, 3]
2626

@@ -34,7 +34,7 @@ def data_heterogneous_unordered():
3434
X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
3535
# create a categorical feature using some integer
3636
X[:, 3] = rng.randint(3, size=30)
37-
y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
37+
y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
3838
# return the categories
3939
return X, y, [0, 3]
4040

@@ -48,7 +48,7 @@ def data_heterogneous_masked():
4848
X[:, 0] = rng.choice(["a", "b", "c"], size=30).astype(object)
4949
# create a categorical feature using some integer
5050
X[:, 3] = rng.randint(3, size=30)
51-
y = np.array([[0,2,3]] * 5 +[[1,2,3,4]]*2 + [[1,2]]*3+[[1]] * 20)
51+
y = np.array([[0, 2, 3]] * 5 + [[1, 2, 3, 4]]*2 + [[1, 2]]*3+[[1]] * 20)
5252
# return the categories
5353
return X, y, [True, False, True]
5454

@@ -83,6 +83,7 @@ def test_mlsmote(data):
8383
assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx])
8484
assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
8585

86+
8687
def test_mlsmote_fit():
8788
X, y, categorical_features = data_heterogneous_unordered()
8889
smote = MLSMOTE(categorical_features=categorical_features)
@@ -94,11 +95,12 @@ def test_mlsmote_fit():
9495

9596
def test_mlsmote_fit_resample():
9697
X, y, categorical_features = data_heterogneous_unordered()
97-
target_stats = Counter(np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
98+
target_stats = Counter(np.unique(
99+
np.array([a for x in y for a in (x if isinstance(x, list) else [x])])))
98100
smote = MLSMOTE(categorical_features=categorical_features)
99101
_, y_res = smote.fit_resample(X, y)
100-
classes_res=np.unique(np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
102+
classes_res = np.unique(
103+
np.array([a for x in y_res for a in (x if isinstance(x, list) else [x])]))
101104
_ = Counter(classes_res)
102105
n_samples = max(target_stats.values())
103106
assert all(value >= n_samples for value in Counter(classes_res).values())
104-

0 commit comments

Comments
 (0)