#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+ #include <linux/sort.h>

static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
                                unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
        return nodes;
}

+ struct node_vectors {
+         unsigned id;
+
+         union {
+                 unsigned nvectors;
+                 unsigned ncpus;
+         };
+ };
+
+ static int ncpus_cmp_func(const void *l, const void *r)
+ {
+         const struct node_vectors *ln = l;
+         const struct node_vectors *rn = r;
+
+         return ln->ncpus - rn->ncpus;
+ }
+
+ /*
+  * Allocate vector number for each node, so that for each node:
+  *
+  * 1) the allocated number is >= 1
+  *
+  * 2) the allocated number is <= active CPU number of this node
+  *
+  * The actual allocated total vectors may be less than @numvecs when
+  * the active total CPU number is less than @numvecs.
+  *
+  * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+  * for each node.
+  */
+ static void alloc_nodes_vectors(unsigned int numvecs,
+                                 const cpumask_var_t *node_to_cpumask,
+                                 const struct cpumask *cpu_mask,
+                                 const nodemask_t nodemsk,
+                                 struct cpumask *nmsk,
+                                 struct node_vectors *node_vectors)
+ {
+         unsigned n, remaining_ncpus = 0;
+
+         for (n = 0; n < nr_node_ids; n++) {
+                 node_vectors[n].id = n;
+                 node_vectors[n].ncpus = UINT_MAX;
+         }
+
+         for_each_node_mask(n, nodemsk) {
+                 unsigned ncpus;
+
+                 cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+                 ncpus = cpumask_weight(nmsk);
+
+                 if (!ncpus)
+                         continue;
+                 remaining_ncpus += ncpus;
+                 node_vectors[n].ncpus = ncpus;
+         }
+
+         numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+         sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+              ncpus_cmp_func, NULL);
+
+         /*
+          * Allocate vectors for each node according to the ratio of this
+          * node's nr_cpus to the remaining un-assigned ncpus. 'numvecs' is
+          * bigger than the number of active numa nodes. Always start the
+          * allocation from the node with the smallest nr_cpus.
+          *
+          * This guarantees that each active node gets allocated at
+          * least one vector, and the theory is simple: over-allocation
+          * is only done when a node is assigned one vector, so
+          * other nodes will be allocated >= 1 vector, since 'numvecs' is
+          * bigger than the number of numa nodes.
+          *
+          * One perfect invariant is that the number of allocated vectors
+          * for each node is <= the CPU count of this node:
+          *
+          * 1) suppose there are two nodes: A and B
+          *         ncpu(X) is the CPU count of node X
+          *         vecs(X) is the vector count allocated to node X via this
+          *         algorithm
+          *
+          *         ncpu(A) <= ncpu(B)
+          *         ncpu(A) + ncpu(B) = N
+          *         vecs(A) + vecs(B) = V
+          *
+          *         vecs(A) = max(1, round_down(V * ncpu(A) / N))
+          *         vecs(B) = V - vecs(A)
+          *
+          *         both N and V are integers, and 2 <= V <= N, suppose
+          *         V = N - delta, and 0 <= delta <= N - 2
+          *
+          * 2) obviously vecs(A) <= ncpu(A) because:
+          *
+          *         if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+          *         ncpu(A) >= 1
+          *
+          *         otherwise,
+          *                 vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+          *
+          * 3) prove how vecs(B) <= ncpu(B):
+          *
+          *         if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+          *         over-allocated, so vecs(B) <= ncpu(B),
+          *
+          *         otherwise:
+          *
+          *         vecs(A) =
+          *                 round_down(V * ncpu(A) / N) =
+          *                 round_down((N - delta) * ncpu(A) / N) =
+          *                 round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+          *                 round_down((N * ncpu(A) - delta * N) / N) =
+          *                 ncpu(A) - delta
+          *
+          *         then:
+          *
+          *         vecs(A) - V >= ncpu(A) - delta - V
+          *         =>
+          *         V - vecs(A) <= V + delta - ncpu(A)
+          *         =>
+          *         vecs(B) <= N - ncpu(A)
+          *         =>
+          *         vecs(B) <= ncpu(B)
+          *
+          * For nodes >= 3, the remaining nodes can be treated as one big
+          * node, which is exactly how this algorithm iterates, and we
+          * always re-calculate 'remaining_ncpus' & 'numvecs', so finally
+          * for each node X: vecs(X) <= ncpu(X).
+          *
+          */
+         for (n = 0; n < nr_node_ids; n++) {
+                 unsigned nvectors, ncpus;
+
+                 if (node_vectors[n].ncpus == UINT_MAX)
+                         continue;
+
+                 WARN_ON_ONCE(numvecs == 0);
+
+                 ncpus = node_vectors[n].ncpus;
+                 nvectors = max_t(unsigned, 1,
+                                  numvecs * ncpus / remaining_ncpus);
+                 WARN_ON_ONCE(nvectors > ncpus);
+
+                 node_vectors[n].nvectors = nvectors;
+
+                 remaining_ncpus -= ncpus;
+                 numvecs -= nvectors;
+         }
+ }
+
static int __irq_build_affinity_masks(unsigned int startvec,
                                      unsigned int numvecs,
                                      unsigned int firstvec,
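The allocation loop above can be sanity-checked outside the kernel. The following is a minimal user-space sketch (not part of the patch) that replays the same ratio-based assignment on a hypothetical three-node topology with 2, 6 and 24 active CPUs and 8 requested vectors; the node count, CPU counts and vector count are made up for illustration, and qsort()/assert() stand in for the kernel's sort() and WARN_ON_ONCE().

/* Stand-alone illustration of the ratio-based allocation; values are hypothetical. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct node_vectors { unsigned id, ncpus, nvectors; };

static int ncpus_cmp(const void *l, const void *r)
{
        const struct node_vectors *ln = l, *rn = r;

        return (int)ln->ncpus - (int)rn->ncpus;
}

int main(void)
{
        /* Hypothetical topology: three active nodes with 2, 6 and 24 CPUs. */
        struct node_vectors nodes[] = {
                { .id = 0, .ncpus = 2 },
                { .id = 1, .ncpus = 6 },
                { .id = 2, .ncpus = 24 },
        };
        unsigned n, numvecs = 8, remaining_ncpus = 0;

        for (n = 0; n < 3; n++)
                remaining_ncpus += nodes[n].ncpus;

        /* Start from the node with the smallest CPU count, as the patch does. */
        qsort(nodes, 3, sizeof(nodes[0]), ncpus_cmp);

        for (n = 0; n < 3; n++) {
                unsigned ncpus = nodes[n].ncpus;
                unsigned nvecs = numvecs * ncpus / remaining_ncpus;

                if (nvecs < 1)
                        nvecs = 1;              /* each active node gets at least one vector */
                assert(nvecs <= ncpus);         /* the invariant proven in the comment */
                nodes[n].nvectors = nvecs;
                remaining_ncpus -= ncpus;
                numvecs -= nvecs;
                printf("node %u: ncpus=%u nvectors=%u\n", nodes[n].id, ncpus, nvecs);
        }
        return 0;
}

With these made-up numbers the 8 vectors end up split 1/1/6 across the sorted nodes, so every active node receives at least one vector and no node receives more vectors than it has CPUs, matching the invariant proven in the comment.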
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
                                      struct cpumask *nmsk,
                                      struct irq_affinity_desc *masks)
{
-         unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+         unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
        unsigned int last_affv = firstvec + numvecs;
        unsigned int curvec = startvec;
        nodemask_t nodemsk = NODE_MASK_NONE;
+         struct node_vectors *node_vectors;

        if (!cpumask_weight(cpu_mask))
                return 0;
@@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec,
                return numvecs;
        }

-         for_each_node_mask(n, nodemsk) {
-                 unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+         node_vectors = kcalloc(nr_node_ids,
+                                sizeof(struct node_vectors),
+                                GFP_KERNEL);
+         if (!node_vectors)
+                 return -ENOMEM;
+
+         /* allocate vector number for each node */
+         alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+                             nodemsk, nmsk, node_vectors);
+
+         for (i = 0; i < nr_node_ids; i++) {
+                 unsigned int ncpus, v;
+                 struct node_vectors *nv = &node_vectors[i];
+
+                 if (nv->nvectors == UINT_MAX)
+                         continue;

                /* Get the cpus on this node which are in the mask */
-                 cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+                 cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
                ncpus = cpumask_weight(nmsk);
                if (!ncpus)
                        continue;

-                 /*
-                  * Calculate the number of cpus per vector
-                  *
-                  * Spread the vectors evenly per node. If the requested
-                  * vector number has been reached, simply allocate one
-                  * vector for each remaining node so that all nodes can
-                  * be covered
-                  */
-                 if (numvecs > done)
-                         vecs_per_node = max_t(unsigned,
-                                         (numvecs - done) / nodes, 1);
-                 else
-                         vecs_per_node = 1;
-
-                 vecs_to_assign = min(vecs_per_node, ncpus);
+                 WARN_ON_ONCE(nv->nvectors > ncpus);

                /* Account for rounding errors */
-                 extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+                 extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);

-                 for (v = 0; curvec < last_affv && v < vecs_to_assign;
-                                 curvec++, v++) {
-                         cpus_per_vec = ncpus / vecs_to_assign;
+                 /* Spread allocated vectors on CPUs of the current node */
+                 for (v = 0; v < nv->nvectors; v++, curvec++) {
+                         cpus_per_vec = ncpus / nv->nvectors;

                        /* Account for extra vectors to compensate rounding errors */
                        if (extra_vecs) {
                                cpus_per_vec++;
                                --extra_vecs;
                        }
+
+                         /*
+                          * wrapping has to be considered given 'startvec'
+                          * may start anywhere
+                          */
+                         if (curvec >= last_affv)
+                                 curvec = firstvec;
                        irq_spread_init_one(&masks[curvec].mask, nmsk,
                                            cpus_per_vec);
                }
-
-                 done += v;
-                 if (curvec >= last_affv)
-                         curvec = firstvec;
-                 --nodes;
+                 done += nv->nvectors;
        }
-         return done < numvecs ? done : numvecs;
+         kfree(node_vectors);
+         return done;
}

/*
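Once each node has its vector count, the rewritten loop above divides that node's CPUs across its vectors, gives the first 'extra_vecs' vectors one extra CPU to absorb the division remainder, and wraps curvec back to firstvec when it runs past last_affv. Below is a minimal stand-alone sketch (not part of the patch) of just that distribution step, using made-up numbers: 10 CPUs, 4 vectors, vector window [2, 8), starting at vector 6.

/* Hypothetical per-node spread: 10 CPUs over 4 vectors, window [2, 8). */
#include <stdio.h>

int main(void)
{
        unsigned ncpus = 10, nvectors = 4;
        unsigned firstvec = 2, last_affv = 8, curvec = 6;       /* startvec may be anywhere */
        unsigned extra_vecs = ncpus - nvectors * (ncpus / nvectors);    /* rounding remainder */
        unsigned v;

        for (v = 0; v < nvectors; v++, curvec++) {
                unsigned cpus_per_vec = ncpus / nvectors;

                /* the first 'extra_vecs' vectors absorb the remainder */
                if (extra_vecs) {
                        cpus_per_vec++;
                        --extra_vecs;
                }
                /* wrap when the window end is reached, mirroring the patch */
                if (curvec >= last_affv)
                        curvec = firstvec;
                printf("vector %u gets %u CPUs\n", curvec, cpus_per_vec);
        }
        return 0;
}

With those numbers the loop assigns 3, 3, 2 and 2 CPUs to vectors 6, 7, 2 and 3 respectively; the wrap at last_affv is what makes an arbitrary startvec safe.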
@@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
                                    unsigned int firstvec,
                                    struct irq_affinity_desc *masks)
{
-         unsigned int curvec = startvec, nr_present, nr_others;
+         unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
        cpumask_var_t *node_to_cpumask;
        cpumask_var_t nmsk, npresmsk;
        int ret = -ENOMEM;
@@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
        if (!node_to_cpumask)
                goto fail_npresmsk;

-         ret = 0;
        /* Stabilize the cpumasks */
        get_online_cpus();
        build_node_to_cpumask(node_to_cpumask);

        /* Spread on present CPUs starting from affd->pre_vectors */
-         nr_present = __irq_build_affinity_masks(curvec, numvecs,
-                                                 firstvec, node_to_cpumask,
-                                                 cpu_present_mask, nmsk, masks);
+         ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+                                          node_to_cpumask, cpu_present_mask,
+                                          nmsk, masks);
+         if (ret < 0)
+                 goto fail_build_affinity;
+         nr_present = ret;

        /*
         * Spread on non present CPUs starting from the next vector to be
@@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
        else
                curvec = firstvec + nr_present;
        cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-         nr_others = __irq_build_affinity_masks(curvec, numvecs,
-                                                firstvec, node_to_cpumask,
-                                                npresmsk, nmsk, masks);
+         ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+                                          node_to_cpumask, npresmsk, nmsk,
+                                          masks);
+         if (ret >= 0)
+                 nr_others = ret;
+
+ fail_build_affinity:
        put_online_cpus();

-         if (nr_present < numvecs)
+         if (ret >= 0)
                WARN_ON(nr_present + nr_others < numvecs);

        free_node_to_cpumask(node_to_cpumask);
@@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,

 fail_nmsk:
        free_cpumask_var(nmsk);
-         return ret;
+         return ret < 0 ? ret : 0;
}

static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)