3
3
import collections
4
4
import random
5
5
6
+
6
7
class MLSMOTE :
7
8
"""Over-sampling using MLSMOTE.
8
9
@@ -35,47 +36,49 @@ class MLSMOTE:
35
36
Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019.
36
37
37
38
"""
38
def __init__(self, categorical_features, k_neighbors=5, sampling_strategy='ranking'):
    """Store the sampler configuration and reset the fitted state.

    Parameters
    ----------
    categorical_features :
        Description of the nominal feature columns; stored unchanged here.
    k_neighbors : int, default=5
        How many of the closest minority samples are kept as neighbours
        when a synthetic sample is generated.
    sampling_strategy : str, default='ranking'
        Name of the rule used to derive the label set of a synthetic
        sample ('ranking', 'union' or 'intersection').
    """
    # User-supplied configuration.
    self.categorical_features = categorical_features
    self.k_neighbors = k_neighbors
    self.sampling_strategy_ = sampling_strategy

    # State that is only populated while resampling.
    self.continuous_features_ = None
    self.unique_labels = []
    self.labels = []
    self.features = []
46
48
47
def fit_resample(self, X, y):
    """Return the dataset extended with synthetic minority-label samples.

    For every label whose imbalance ratio exceeds the mean imbalance
    ratio, each sample carrying that label is paired with one randomly
    chosen member of its ``k_neighbors`` closest samples (within the same
    label bag) and a synthetic sample is created from the pair.

    Parameters
    ----------
    X : array with a ``shape`` attribute and at least two dimensions
        Feature matrix; ``X.shape[1]`` is recorded as ``n_features_``.
    y : array supporting ``copy()`` and ``tolist()``
        One label set per row of ``X``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``: the original rows followed by the
        synthetic ones. When no label exceeds the mean imbalance ratio
        the data is returned unchanged (as copies).
    """
    def _to_array(rows):
        # Label sets may differ in length. Recent NumPy raises ValueError
        # instead of silently building a ragged array, so fall back to an
        # explicit object array in that case.
        try:
            return np.array(rows)
        except ValueError:
            return np.array(rows, dtype=object)

    self.n_features_ = X.shape[1]
    self.labels = _to_array([np.array(label_set) for label_set in y])

    self._validate_estimator()

    X_resampled = X.copy()
    y_resampled = y.copy()

    self.unique_labels = self._collect_unique_labels(y)
    self.features = X

    X_synth = []
    y_synth = []

    mean_ir = self._get_mean_imbalance_ratio()
    for label in self.unique_labels:
        irlbl = self._get_imbalance_ratio_per_label(label)
        if irlbl > mean_ir:
            min_bag = self._get_all_instances_of_label(label)
            for sample in min_bag:
                distances = np.sort(
                    self._calc_distances(sample, min_bag), order='distance')
                neighbours = distances[:self.k_neighbors]
                # Each record is (distance, index); field 1 is the row index.
                ref_neigh = np.random.choice(neighbours, 1)[0]
                X_new, y_new = self._create_new_sample(
                    sample, ref_neigh[1], [n[1] for n in neighbours])
                X_synth.append(X_new)
                y_synth.append(y_new)

    # np.array([]) is one-dimensional and cannot be concatenated with a 2-D
    # matrix, so only concatenate when something was actually generated.
    if X_synth:
        X_out = np.concatenate((X_resampled, np.array(X_synth)))
    else:
        X_out = X_resampled
    y_out = _to_array(y_resampled.tolist() + y_synth)
    return X_out, y_out
79
82
80
83
def _validate_estimator (self ):
81
84
categorical_features = np .asarray (self .categorical_features )
@@ -101,102 +104,110 @@ def _collect_unique_labels(self, y):
101
104
"""A support function that flattens the labelsets and return one set of unique labels"""
102
105
return np .unique (np .array ([a for x in y for a in (x if isinstance (x , list ) else [x ])]))
103
106
104
def _create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):
    """Build one synthetic sample and its label set.

    Parameters
    ----------
    sample_id : int
        Row index (into ``self.features`` / ``self.labels``) of the seed.
    ref_neigh_id : int
        Row index of the neighbour used for interpolation.
    neighbour_ids : list of int
        Row indices of all selected neighbours.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the synthetic feature vector and ``y``
        is a list of labels chosen according to ``sampling_strategy_``.

    Raises
    ------
    ValueError
        If ``sampling_strategy_`` is not 'ranking', 'union' or
        'intersection'.
    """
    sample = self.features[sample_id]
    ref_neigh = self.features[ref_neigh_id]
    # NOTE(review): the copy keeps the dtype of the feature row, so an
    # integer feature matrix would truncate interpolated values - confirm
    # that callers pass floating-point features.
    synth_sample = np.copy(sample)

    for i in range(synth_sample.shape[0]):
        if i in self.continuous_features_:
            # Random point on the segment between the seed and the neighbour.
            diff = ref_neigh[i] - sample[i]
            synth_sample[i] = sample[i] + diff * random.uniform(0, 1)
        if i in self.categorical_features_:
            synth_sample[i] = self._get_most_frequent_value(
                self.features[neighbour_ids, i])

    sample_labels = np.atleast_1d(self.labels[sample_id]).tolist()
    neighbours_labels = [
        np.atleast_1d(self.labels[ni]).tolist() for ni in neighbour_ids]

    # Occurrence count per label over the seed and all neighbours. Counter
    # keeps first-seen order, which makes every strategy deterministic.
    label_counts = collections.Counter(sample_labels)
    for neighbour_label_set in neighbours_labels:
        label_counts.update(neighbour_label_set)

    strategy = self.sampling_strategy_
    if strategy == 'ranking':
        # Keep the most frequent labels instead of an arbitrary slice of a
        # de-duplicated set (whose order is not defined).
        # NOTE(review): the MLSMOTE paper appears to keep labels present in
        # at least half of the instances - confirm whether a threshold is
        # preferred over a fixed-size head.
        head_index = int((self.k_neighbors + 1) / 2)
        y = [label for label, _ in label_counts.most_common(head_index)]
    elif strategy == 'union':
        y = list(label_counts)
    elif strategy == 'intersection':
        # set.intersection needs set arguments; the label sets are lists.
        label_sets = [set(labels) for labels in neighbours_labels]
        common = set.intersection(*label_sets) if label_sets else set()
        y = [label for label in label_counts if label in common]
    else:
        raise ValueError(
            "Unknown sampling strategy: {!r}".format(strategy))

    return synth_sample, y
138
+
139
def _calc_distances(self, sample, min_bag):
    """Compute the distance from ``sample`` to every member of ``min_bag``.

    ``sample`` and the entries of ``min_bag`` are row indices into
    ``self.features``. For each pair, the per-feature value-difference
    metric over the categorical columns and the per-feature Euclidean
    distance over the continuous columns are summed.

    Returns a structured array with fields ``'distance'`` (float) and
    ``'index'`` (int), one record per entry of ``min_bag`` in the same order.
    """
    feats = self.features
    record_type = np.dtype([('distance', float), ('index', int)])

    def _total_distance(other):
        nominal_part = np.array([
            self._get_vdm(feats[sample, col], feats[other, col])
            for col in self.categorical_features_
        ])
        numeric_part = np.array([
            self._get_euclidean_distance(feats[sample, col], feats[other, col])
            for col in self.continuous_features_
        ])
        return np.array([nominal_part.sum(), numeric_part.sum()]).sum()

    records = [(_total_distance(other), other) for other in min_bag]
    return np.array(records, dtype=record_type)
152
+
153
def _get_euclidean_distance(self, first, second):
    """Return the Euclidean (L2) norm of ``first - second``."""
    return np.linalg.norm(first - second)
150
156
151
def _get_vdm(self, first, second):
    """Compute the Value Difference Metric (VDM) between two nominal values.

    The metric is described in https://arxiv.org/pdf/cs/9701101.pdf. For
    every label ``c`` in ``self.unique_labels`` the squared difference
    between the share of ``first`` occurrences and the share of ``second``
    occurrences found among the samples carrying ``c`` is accumulated.

    Occurrences are counted over all categorical columns of
    ``self.features`` at once, because only the two values (not the column
    they come from) are passed in.

    A value that never occurs contributes a share of 0 rather than
    dividing by zero.
    """
    categorical = self.features[:, self.categorical_features_]
    # Overall occurrence counts do not depend on the label, so compute them
    # once. count_nonzero counts matching cells; the length of the tuple
    # returned by np.where would only be the number of array dimensions.
    n_first = np.count_nonzero(categorical == first)
    n_second = np.count_nonzero(categorical == second)

    def _share(hits, total):
        return hits / total if total else 0.0

    terms = []
    for label in self.unique_labels:
        rows = self._get_all_instances_of_label(label)
        in_label = self.features[np.ix_(rows, self.categorical_features_)]
        share_first = _share(np.count_nonzero(in_label == first), n_first)
        share_second = _share(np.count_nonzero(in_label == second), n_second)
        terms.append(np.square(np.abs(share_first - share_second)))

    return np.sum(np.array(terms))
162
172
163
def _get_all_instances_of_label(self, label):
    """Return the row indices of all samples whose label set contains ``label``.

    The result is always an integer array, including when no sample
    carries the label, so it can be used directly as an index.
    """
    return np.array(
        [row for row, label_set in enumerate(self.labels) if label in label_set],
        dtype=int,
    )
170
180
171
181
def _get_mean_imbalance_ratio(self):
    """Return the mean of the per-label imbalance ratios over ``self.unique_labels``."""
    per_label = np.array(
        [self._get_imbalance_ratio_per_label(lbl) for lbl in self.unique_labels])
    label_count = self.unique_labels.shape[0]
    return np.sum(per_label) / label_count
174
185
175
def _get_imbalance_ratio_per_label(self, label):
    """Return the imbalance ratio of ``label``.

    This is the occurrence count of the most frequent label divided by
    the occurrence count of ``label``.
    """
    occurrences = np.array([self._sum_h(lbl) for lbl in self.unique_labels])
    return occurrences.max() / self._sum_h(label)
179
190
180
def _sum_h(self, label):
    """Count how many label sets in ``self.labels`` contain ``label``."""
    return sum(1 for label_set in self.labels if label in label_set)
191
203
192
-
193
def _get_label_frequencies(self, labels):
    """A support function to get the frequencies of labels.

    Returns an integer array holding the number of occurrences of each
    distinct label, ordered like the sorted distinct labels.
    """
    # Take the counts straight from np.unique; the previous body read an
    # undefined name and could never run.
    _, frequencies = np.unique(labels, return_counts=True)
    return frequencies
198
-
209
+
199
210
def _get_most_frequent_value(self, values):
    """A support function to get the most frequent value in a list of values.

    When several values are equally frequent, the smallest one (first in
    sorted order) is returned.
    """
    distinct, counts = np.unique(values, return_counts=True)
    return distinct[counts.argmax()]
0 commit comments