
Commit b1a5a73

Ming Lei authored and KAGA-KOKO committed
genirq/affinity: Spread vectors on node according to nr_cpu ratio
Now __irq_build_affinity_masks() spreads vectors evenly per node, but there
is a case in which not all vectors are spread when the NUMA nodes have
different numbers of CPUs, which triggers the warning in the spreading code.

Improve the spreading algorithm by

 - assigning vectors according to the ratio of the number of CPUs on a node
   to the number of remaining CPUs, and

 - running the assignment from smaller nodes to bigger nodes to guarantee
   that every active node gets allocated at least one vector.

This ensures that all vectors are spread out. Aside from that, the spread
becomes fairer if the nodes have different numbers of CPUs.

For example, on the following machine:

    CPU(s):              16
    On-line CPU(s) list: 0-15
    Thread(s) per core:  1
    Core(s) per socket:  8
    Socket(s):           2
    NUMA node(s):        2
    ...
    NUMA node0 CPU(s):   0,1,3,5-9,11,13-15
    NUMA node1 CPU(s):   2,4,10,12

When a driver requests 8 vectors, the following spread results:

    irq 31, cpu list 2,4
    irq 32, cpu list 10,12
    irq 33, cpu list 0-1
    irq 34, cpu list 3,5
    irq 35, cpu list 6-7
    irq 36, cpu list 8-9
    irq 37, cpu list 11,13
    irq 38, cpu list 14-15

Node 0 now has 6 vectors and Node 1 has 2 vectors assigned. The original
algorithm assigned 4 vectors to each node, which was unfair to Node 0.

[ tglx: Massaged changelog ]

Reported-by: Jon Derrick <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Keith Busch <[email protected]>
Reviewed-by: Jon Derrick <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
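For illustration only, here is a minimal standalone userspace sketch (not part of this commit) of the ratio computation described in the changelog: node 0 with 12 active CPUs, node 1 with 4, and 8 requested vectors. The CPU counts and vector total are taken from the example machine above; the helper names and everything else are assumptions made for the sketch.

#include <stdio.h>
#include <stdlib.h>

/* Simplified userspace stand-in for struct node_vectors. */
struct node { unsigned id; unsigned ncpus; };

/* Sort ascending by CPU count, mirroring ncpus_cmp_func() in the patch. */
static int ncpus_cmp(const void *l, const void *r)
{
	const struct node *ln = l;
	const struct node *rn = r;

	return (int)ln->ncpus - (int)rn->ncpus;
}

int main(void)
{
	/* node0: 12 active CPUs, node1: 4 active CPUs (changelog example) */
	struct node nodes[] = { { 0, 12 }, { 1, 4 } };
	unsigned numvecs = 8, remaining_ncpus = 16;
	unsigned i;

	/* Assign the smallest node first, as alloc_nodes_vectors() does. */
	qsort(nodes, 2, sizeof(nodes[0]), ncpus_cmp);

	for (i = 0; i < 2; i++) {
		unsigned ncpus = nodes[i].ncpus;
		/* ratio of this node's CPUs to the remaining unassigned CPUs */
		unsigned nvecs = numvecs * ncpus / remaining_ncpus;

		if (nvecs < 1)
			nvecs = 1;
		printf("node%u: %u vectors\n", nodes[i].id, nvecs);
		remaining_ncpus -= ncpus;
		numvecs -= nvecs;
	}
	return 0;
}

Compiled and run, it prints "node1: 2 vectors" and "node0: 6 vectors", matching the irq list in the changelog.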
1 parent 53c1788 commit b1a5a73

1 file changed: +200 -39 lines changed


kernel/irq/affinity.c

Lines changed: 200 additions & 39 deletions
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/sort.h>
 
 static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 				unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 	return nodes;
 }
 
+struct node_vectors {
+	unsigned id;
+
+	union {
+		unsigned nvectors;
+		unsigned ncpus;
+	};
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+	const struct node_vectors *ln = l;
+	const struct node_vectors *rn = r;
+
+	return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+				const cpumask_var_t *node_to_cpumask,
+				const struct cpumask *cpu_mask,
+				const nodemask_t nodemsk,
+				struct cpumask *nmsk,
+				struct node_vectors *node_vectors)
+{
+	unsigned n, remaining_ncpus = 0;
+
+	for (n = 0; n < nr_node_ids; n++) {
+		node_vectors[n].id = n;
+		node_vectors[n].ncpus = UINT_MAX;
+	}
+
+	for_each_node_mask(n, nodemsk) {
+		unsigned ncpus;
+
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		ncpus = cpumask_weight(nmsk);
+
+		if (!ncpus)
+			continue;
+		remaining_ncpus += ncpus;
+		node_vectors[n].ncpus = ncpus;
+	}
+
+	numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+	sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+	     ncpus_cmp_func, NULL);
+
+	/*
+	 * Allocate vectors for each node according to the ratio of this
+	 * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+	 * bigger than number of active numa nodes. Always start the
+	 * allocation from the node with minimized nr_cpus.
+	 *
+	 * This way guarantees that each active node gets allocated at
+	 * least one vector, and the theory is simple: over-allocation
+	 * is only done when this node is assigned by one vector, so
+	 * other nodes will be allocated >= 1 vector, since 'numvecs' is
+	 * bigger than number of numa nodes.
+	 *
+	 * One perfect invariant is that number of allocated vectors for
+	 * each node is <= CPU count of this node:
+	 *
+	 * 1) suppose there are two nodes: A and B
+	 *	ncpu(X) is CPU count of node X
+	 *	vecs(X) is the vector count allocated to node X via this
+	 *	algorithm
+	 *
+	 *	ncpu(A) <= ncpu(B)
+	 *	ncpu(A) + ncpu(B) = N
+	 *	vecs(A) + vecs(B) = V
+	 *
+	 *	vecs(A) = max(1, round_down(V * ncpu(A) / N))
+	 *	vecs(B) = V - vecs(A)
+	 *
+	 *	both N and V are integer, and 2 <= V <= N, suppose
+	 *	V = N - delta, and 0 <= delta <= N - 2
+	 *
+	 * 2) obviously vecs(A) <= ncpu(A) because:
+	 *
+	 *	if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+	 *	ncpu(A) >= 1
+	 *
+	 *	otherwise,
+	 *		vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+	 *
+	 * 3) prove how vecs(B) <= ncpu(B):
+	 *
+	 *	if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+	 *	over-allocated, so vecs(B) <= ncpu(B),
+	 *
+	 *	otherwise:
+	 *
+	 *	vecs(A) =
+	 *		round_down(V * ncpu(A) / N) =
+	 *		round_down((N - delta) * ncpu(A) / N) =
+	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+	 *		round_down((N * ncpu(A) - delta * N) / N) =
+	 *		ncpu(A) - delta
+	 *
+	 *	then:
+	 *		vecs(A) - V >= ncpu(A) - delta - V
+	 *		=>
+	 *		V - vecs(A) <= V + delta - ncpu(A)
+	 *		=>
+	 *		vecs(B) <= N - ncpu(A)
+	 *		=>
+	 *		vecs(B) <= ncpu(B)
+	 *
+	 * For nodes >= 3, it can be thought as one node and another big
+	 * node given that is exactly what this algorithm is implemented,
+	 * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+	 * finally for each node X: vecs(X) <= ncpu(X).
+	 *
+	 */
+	for (n = 0; n < nr_node_ids; n++) {
+		unsigned nvectors, ncpus;
+
+		if (node_vectors[n].ncpus == UINT_MAX)
+			continue;
+
+		WARN_ON_ONCE(numvecs == 0);
+
+		ncpus = node_vectors[n].ncpus;
+		nvectors = max_t(unsigned, 1,
+				 numvecs * ncpus / remaining_ncpus);
+		WARN_ON_ONCE(nvectors > ncpus);
+
+		node_vectors[n].nvectors = nvectors;
+
+		remaining_ncpus -= ncpus;
+		numvecs -= nvectors;
+	}
+}
+
 static int __irq_build_affinity_masks(unsigned int startvec,
 				      unsigned int numvecs,
 				      unsigned int firstvec,
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 				      struct cpumask *nmsk,
 				      struct irq_affinity_desc *masks)
 {
-	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	unsigned int last_affv = firstvec + numvecs;
 	unsigned int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct node_vectors *node_vectors;
 
 	if (!cpumask_weight(cpu_mask))
 		return 0;
@@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 		return numvecs;
 	}
 
-	for_each_node_mask(n, nodemsk) {
-		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+	node_vectors = kcalloc(nr_node_ids,
+			       sizeof(struct node_vectors),
+			       GFP_KERNEL);
+	if (!node_vectors)
+		return -ENOMEM;
+
+	/* allocate vector number for each node */
+	alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+			    nodemsk, nmsk, node_vectors);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		unsigned int ncpus, v;
+		struct node_vectors *nv = &node_vectors[i];
+
+		if (nv->nvectors == UINT_MAX)
+			continue;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
 		ncpus = cpumask_weight(nmsk);
 		if (!ncpus)
 			continue;
 
-		/*
-		 * Calculate the number of cpus per vector
-		 *
-		 * Spread the vectors evenly per node. If the requested
-		 * vector number has been reached, simply allocate one
-		 * vector for each remaining node so that all nodes can
-		 * be covered
-		 */
-		if (numvecs > done)
-			vecs_per_node = max_t(unsigned,
-					(numvecs - done) / nodes, 1);
-		else
-			vecs_per_node = 1;
-
-		vecs_to_assign = min(vecs_per_node, ncpus);
+		WARN_ON_ONCE(nv->nvectors > ncpus);
 
 		/* Account for rounding errors */
-		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+		extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
 
-		for (v = 0; curvec < last_affv && v < vecs_to_assign;
-		     curvec++, v++) {
-			cpus_per_vec = ncpus / vecs_to_assign;
+		/* Spread allocated vectors on CPUs of the current node */
+		for (v = 0; v < nv->nvectors; v++, curvec++) {
+			cpus_per_vec = ncpus / nv->nvectors;
 
 			/* Account for extra vectors to compensate rounding errors */
 			if (extra_vecs) {
 				cpus_per_vec++;
 				--extra_vecs;
 			}
+
+			/*
+			 * wrapping has to be considered given 'startvec'
+			 * may start anywhere
			 */
+			if (curvec >= last_affv)
+				curvec = firstvec;
 			irq_spread_init_one(&masks[curvec].mask, nmsk,
 						cpus_per_vec);
 		}
-
-		done += v;
-		if (curvec >= last_affv)
-			curvec = firstvec;
-		--nodes;
+		done += nv->nvectors;
 	}
-	return done < numvecs ? done : numvecs;
+	kfree(node_vectors);
+	return done;
 }
 
 /*
@@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 				    unsigned int firstvec,
 				    struct irq_affinity_desc *masks)
 {
-	unsigned int curvec = startvec, nr_present, nr_others;
+	unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
 	cpumask_var_t *node_to_cpumask;
 	cpumask_var_t nmsk, npresmsk;
 	int ret = -ENOMEM;
@@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	if (!node_to_cpumask)
 		goto fail_npresmsk;
 
-	ret = 0;
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	build_node_to_cpumask(node_to_cpumask);
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
-	nr_present = __irq_build_affinity_masks(curvec, numvecs,
-						firstvec, node_to_cpumask,
-						cpu_present_mask, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, cpu_present_mask,
+					 nmsk, masks);
+	if (ret < 0)
+		goto fail_build_affinity;
+	nr_present = ret;
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	else
 		curvec = firstvec + nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	nr_others = __irq_build_affinity_masks(curvec, numvecs,
-					       firstvec, node_to_cpumask,
-					       npresmsk, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, npresmsk, nmsk,
+					 masks);
+	if (ret >= 0)
+		nr_others = ret;
+
+ fail_build_affinity:
 	put_online_cpus();
 
-	if (nr_present < numvecs)
+	if (ret >= 0)
 		WARN_ON(nr_present + nr_others < numvecs);
 
 	free_node_to_cpumask(node_to_cpumask);
@@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 
  fail_nmsk:
 	free_cpumask_var(nmsk);
-	return ret;
+	return ret < 0 ? ret : 0;
 }
 
 static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
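As a sanity check of the two-node invariant proved in the alloc_nodes_vectors() comment above (no node is allocated more vectors than it has CPUs), here is a brute-force userspace sketch. It is an illustration under stated assumptions, not kernel code; the 64-CPU-per-node bound is arbitrary.

#include <stdio.h>

int main(void)
{
	unsigned a, b, v, violations = 0;

	for (a = 1; a <= 64; a++) {		/* ncpu(A) */
		for (b = a; b <= 64; b++) {	/* ncpu(B), with ncpu(A) <= ncpu(B) */
			unsigned n = a + b;	/* N */

			for (v = 2; v <= n; v++) {	/* 2 <= V <= N */
				/* vecs(A) = max(1, round_down(V * ncpu(A) / N)) */
				unsigned vecs_a = v * a / n;
				unsigned vecs_b;

				if (vecs_a < 1)
					vecs_a = 1;
				vecs_b = v - vecs_a;	/* vecs(B) = V - vecs(A) */
				if (vecs_a > a || vecs_b > b)
					violations++;
			}
		}
	}
	printf("violations: %u\n", violations);	/* expected output: 0 */
	return 0;
}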
