Skip to content

Commit 596ff4a

Browse files
committed
cpumask: re-introduce constant-sized cpumask optimizations
Commit aa47a7c ("lib/cpumask: deprecate nr_cpumask_bits") resulted in the cpumask operations potentially becoming hugely less efficient, because suddenly the cpumask was always considered to be variable-sized. The optimization was then later added back in a limited form by commit 6f9c07b ("lib/cpumask: add FORCE_NR_CPUS config option"), but that FORCE_NR_CPUS option is not useful in a generic kernel and more of a special case for embedded situations with fixed hardware. Instead, just re-introduce the optimization, with some changes. Instead of depending on CPUMASK_OFFSTACK being false, and then always using the full constant cpumask width, this introduces three different cpumask "sizes": - the exact size (nr_cpumask_bits) remains identical to nr_cpu_ids. This is used for situations where we should use the exact size. - the "small" size (small_cpumask_bits) is the NR_CPUS constant if it fits in a single word and the bitmap operations thus end up able to trigger the "small_const_nbits()" optimizations. This is used for the operations that have optimized single-word cases that get inlined, notably the bit find and scanning functions. - the "large" size (large_cpumask_bits) is the NR_CPUS constant if it is a sufficiently small constant that makes simple "copy" and "clear" operations more efficient. This is arbitrarily set at four words or less. As an example of this situation, without this fixed size optimization, cpumask_clear() will generate code like movl nr_cpu_ids(%rip), %edx addq $63, %rdx shrq $3, %rdx andl $-8, %edx callq memset@PLT on x86-64, because it would calculate the "exact" number of longwords that need to be cleared. In contrast, with this patch, using a MAX_CPU of 64 (which is quite a reasonable value to use), the above becomes a single movq $0,cpumask instruction instead, because instead of caring to figure out exactly how many CPUs the system has, it just knows that the cpumask will be a single word and can just clear it all. 
Note that this does end up tightening the rules a bit from the original version in another way: operations that set bits in the cpumask are now limited to the actual nr_cpu_ids limit, whereas we used to do the nr_cpumask_bits thing almost everywhere in the cpumask code. But if you just clear bits, or scan for bits, we can use the simpler compile-time constants. In the process, remove 'cpumask_complement()' and 'for_each_cpu_not()' which were not useful, and which fundamentally have to be limited to 'nr_cpu_ids'. Better remove them now than have somebody introduce use of them later. Of course, on x86-64 with MAXSMP there is no sane small compile-time constant for the cpumask sizes, and we end up using the actual CPU bits, and will generate the above kind of horrors regardless. Please don't use MAXSMP unless you really expect to have machines with thousands of cores. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent f915322 commit 596ff4a

File tree

4 files changed

+72
-72
lines changed

4 files changed

+72
-72
lines changed

.clang-format

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,6 @@ ForEachMacros:
226226
- 'for_each_console_srcu'
227227
- 'for_each_cpu'
228228
- 'for_each_cpu_and'
229-
- 'for_each_cpu_not'
230229
- 'for_each_cpu_wrap'
231230
- 'for_each_dapm_widgets'
232231
- 'for_each_dedup_cand'

arch/ia64/kernel/acpi.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -783,11 +783,9 @@ __init void prefill_possible_map(void)
783783

784784
static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
785785
{
786-
cpumask_t tmp_map;
787786
int cpu;
788787

789-
cpumask_complement(&tmp_map, cpu_present_mask);
790-
cpu = cpumask_first(&tmp_map);
788+
cpu = cpumask_first_zero(cpu_present_mask);
791789
if (cpu >= nr_cpu_ids)
792790
return -EINVAL;
793791

include/linux/cpumask.h

Lines changed: 70 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,41 @@ static inline void set_nr_cpu_ids(unsigned int nr)
5050
#endif
5151
}
5252

53-
/* Deprecated. Always use nr_cpu_ids. */
54-
#define nr_cpumask_bits nr_cpu_ids
53+
/*
54+
* We have several different "preferred sizes" for the cpumask
55+
* operations, depending on operation.
56+
*
57+
* For example, the bitmap scanning and operating operations have
58+
* optimized routines that work for the single-word case, but only when
59+
* the size is constant. So if NR_CPUS fits in one single word, we are
60+
* better off using that small constant, in order to trigger the
61+
* optimized bit finding. That is 'small_cpumask_bits'.
62+
*
63+
* The clearing and copying operations will similarly perform better
64+
* with a constant size, but we limit that size arbitrarily to four
65+
* words. We call this 'large_cpumask_bits'.
66+
*
67+
* Finally, some operations just want the exact limit, either because
68+
* they set bits or just don't have any faster fixed-sized versions. We
69+
* call this just 'nr_cpumask_bits'.
70+
*
71+
* Note that these optional constants are always guaranteed to be at
72+
* least as big as 'nr_cpu_ids' itself is, and all our cpumask
73+
* allocations are at least that size (see cpumask_size()). The
74+
* optimization comes from being able to potentially use a compile-time
75+
* constant instead of a run-time generated exact number of CPUs.
76+
*/
77+
#if NR_CPUS <= BITS_PER_LONG
78+
#define small_cpumask_bits ((unsigned int)NR_CPUS)
79+
#define large_cpumask_bits ((unsigned int)NR_CPUS)
80+
#elif NR_CPUS <= 4*BITS_PER_LONG
81+
#define small_cpumask_bits nr_cpu_ids
82+
#define large_cpumask_bits ((unsigned int)NR_CPUS)
83+
#else
84+
#define small_cpumask_bits nr_cpu_ids
85+
#define large_cpumask_bits nr_cpu_ids
86+
#endif
87+
#define nr_cpumask_bits nr_cpu_ids
5588

5689
/*
5790
* The following particular system cpumasks and operations manage
@@ -126,7 +159,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
126159
*/
127160
static inline unsigned int cpumask_first(const struct cpumask *srcp)
128161
{
129-
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
162+
return find_first_bit(cpumask_bits(srcp), small_cpumask_bits);
130163
}
131164

132165
/**
@@ -137,7 +170,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
137170
*/
138171
static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
139172
{
140-
return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits);
173+
return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits);
141174
}
142175

143176
/**
@@ -150,7 +183,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
150183
static inline
151184
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
152185
{
153-
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
186+
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
154187
}
155188

156189
/**
@@ -161,7 +194,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
161194
*/
162195
static inline unsigned int cpumask_last(const struct cpumask *srcp)
163196
{
164-
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
197+
return find_last_bit(cpumask_bits(srcp), small_cpumask_bits);
165198
}
166199

167200
/**
@@ -177,7 +210,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
177210
/* -1 is a legal arg here. */
178211
if (n != -1)
179212
cpumask_check(n);
180-
return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
213+
return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
181214
}
182215

183216
/**
@@ -192,7 +225,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
192225
/* -1 is a legal arg here. */
193226
if (n != -1)
194227
cpumask_check(n);
195-
return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
228+
return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1);
196229
}
197230

198231
#if NR_CPUS == 1
@@ -235,7 +268,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
235268
if (n != -1)
236269
cpumask_check(n);
237270
return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
238-
nr_cpumask_bits, n + 1);
271+
small_cpumask_bits, n + 1);
239272
}
240273

241274
/**
@@ -246,17 +279,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
246279
* After the loop, cpu is >= nr_cpu_ids.
247280
*/
248281
#define for_each_cpu(cpu, mask) \
249-
for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
250-
251-
/**
252-
* for_each_cpu_not - iterate over every cpu in a complemented mask
253-
* @cpu: the (optionally unsigned) integer iterator
254-
* @mask: the cpumask pointer
255-
*
256-
* After the loop, cpu is >= nr_cpu_ids.
257-
*/
258-
#define for_each_cpu_not(cpu, mask) \
259-
for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
282+
for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
260283

261284
#if NR_CPUS == 1
262285
static inline
@@ -290,7 +313,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
290313
* After the loop, cpu is >= nr_cpu_ids.
291314
*/
292315
#define for_each_cpu_wrap(cpu, mask, start) \
293-
for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start)
316+
for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)
294317

295318
/**
296319
* for_each_cpu_and - iterate over every cpu in both masks
@@ -307,7 +330,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
307330
* After the loop, cpu is >= nr_cpu_ids.
308331
*/
309332
#define for_each_cpu_and(cpu, mask1, mask2) \
310-
for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
333+
for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
311334

312335
/**
313336
* for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
@@ -325,7 +348,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
325348
* After the loop, cpu is >= nr_cpu_ids.
326349
*/
327350
#define for_each_cpu_andnot(cpu, mask1, mask2) \
328-
for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
351+
for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
329352

330353
/**
331354
* cpumask_any_but - return a "random" in a cpumask, but not this one.
@@ -356,7 +379,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
356379
*/
357380
static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
358381
{
359-
return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu));
382+
return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu));
360383
}
361384

362385
/**
@@ -372,7 +395,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
372395
const struct cpumask *srcp2)
373396
{
374397
return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
375-
nr_cpumask_bits, cpumask_check(cpu));
398+
small_cpumask_bits, cpumask_check(cpu));
376399
}
377400

378401
/**
@@ -388,7 +411,7 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
388411
const struct cpumask *srcp2)
389412
{
390413
return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
391-
nr_cpumask_bits, cpumask_check(cpu));
414+
small_cpumask_bits, cpumask_check(cpu));
392415
}
393416

394417
/**
@@ -408,7 +431,7 @@ unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp
408431
return find_nth_and_andnot_bit(cpumask_bits(srcp1),
409432
cpumask_bits(srcp2),
410433
cpumask_bits(srcp3),
411-
nr_cpumask_bits, cpumask_check(cpu));
434+
small_cpumask_bits, cpumask_check(cpu));
412435
}
413436

414437
#define CPU_BITS_NONE \
@@ -495,10 +518,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
495518
/**
496519
* cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
497520
* @dstp: the cpumask pointer
521+
*
522+
* Note: since we set bits, we should use the tighter 'bitmap_set()' with
523+
* the exact number of bits, not 'bitmap_fill()' that will fill past the
524+
* end.
498525
*/
499526
static inline void cpumask_setall(struct cpumask *dstp)
500527
{
501-
bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
528+
bitmap_set(cpumask_bits(dstp), 0, nr_cpumask_bits);
502529
}
503530

504531
/**
@@ -507,7 +534,7 @@ static inline void cpumask_setall(struct cpumask *dstp)
507534
*/
508535
static inline void cpumask_clear(struct cpumask *dstp)
509536
{
510-
bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
537+
bitmap_zero(cpumask_bits(dstp), large_cpumask_bits);
511538
}
512539

513540
/**
@@ -523,7 +550,7 @@ static inline bool cpumask_and(struct cpumask *dstp,
523550
const struct cpumask *src2p)
524551
{
525552
return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
526-
cpumask_bits(src2p), nr_cpumask_bits);
553+
cpumask_bits(src2p), small_cpumask_bits);
527554
}
528555

529556
/**
@@ -536,7 +563,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
536563
const struct cpumask *src2p)
537564
{
538565
bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
539-
cpumask_bits(src2p), nr_cpumask_bits);
566+
cpumask_bits(src2p), small_cpumask_bits);
540567
}
541568

542569
/**
@@ -550,7 +577,7 @@ static inline void cpumask_xor(struct cpumask *dstp,
550577
const struct cpumask *src2p)
551578
{
552579
bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
553-
cpumask_bits(src2p), nr_cpumask_bits);
580+
cpumask_bits(src2p), small_cpumask_bits);
554581
}
555582

556583
/**
@@ -566,19 +593,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
566593
const struct cpumask *src2p)
567594
{
568595
return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
569-
cpumask_bits(src2p), nr_cpumask_bits);
570-
}
571-
572-
/**
573-
* cpumask_complement - *dstp = ~*srcp
574-
* @dstp: the cpumask result
575-
* @srcp: the input to invert
576-
*/
577-
static inline void cpumask_complement(struct cpumask *dstp,
578-
const struct cpumask *srcp)
579-
{
580-
bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
581-
nr_cpumask_bits);
596+
cpumask_bits(src2p), small_cpumask_bits);
582597
}
583598

584599
/**
@@ -590,7 +605,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
590605
const struct cpumask *src2p)
591606
{
592607
return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
593-
nr_cpumask_bits);
608+
small_cpumask_bits);
594609
}
595610

596611
/**
@@ -604,7 +619,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
604619
const struct cpumask *src3p)
605620
{
606621
return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
607-
cpumask_bits(src3p), nr_cpumask_bits);
622+
cpumask_bits(src3p), small_cpumask_bits);
608623
}
609624

610625
/**
@@ -616,7 +631,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
616631
const struct cpumask *src2p)
617632
{
618633
return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
619-
nr_cpumask_bits);
634+
small_cpumask_bits);
620635
}
621636

622637
/**
@@ -630,7 +645,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
630645
const struct cpumask *src2p)
631646
{
632647
return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
633-
nr_cpumask_bits);
648+
small_cpumask_bits);
634649
}
635650

636651
/**
@@ -639,7 +654,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
639654
*/
640655
static inline bool cpumask_empty(const struct cpumask *srcp)
641656
{
642-
return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
657+
return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits);
643658
}
644659

645660
/**
@@ -657,7 +672,7 @@ static inline bool cpumask_full(const struct cpumask *srcp)
657672
*/
658673
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
659674
{
660-
return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
675+
return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits);
661676
}
662677

663678
/**
@@ -668,7 +683,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
668683
static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
669684
const struct cpumask *srcp2)
670685
{
671-
return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
686+
return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
672687
}
673688

674689
/**
@@ -681,7 +696,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp,
681696
const struct cpumask *srcp, int n)
682697
{
683698
bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
684-
nr_cpumask_bits);
699+
small_cpumask_bits);
685700
}
686701

687702
/**
@@ -705,7 +720,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
705720
static inline void cpumask_copy(struct cpumask *dstp,
706721
const struct cpumask *srcp)
707722
{
708-
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
723+
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits);
709724
}
710725

711726
/**
@@ -789,7 +804,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
789804
*/
790805
static inline unsigned int cpumask_size(void)
791806
{
792-
return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
807+
return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
793808
}
794809

795810
/*

0 commit comments

Comments
 (0)