
Commit 9a46ad6

Shaohua Li authored and torvalds committed
smp: make smp_call_function_many() use logic similar to smp_call_function_single()
I'm testing a swapout workload on a two-socket Xeon machine. The workload has 10 threads; each thread sequentially accesses a separate memory region. TLB flush overhead is very big in this workload: for each page, page reclaim needs to move it off the active LRU list and then unmap it, and both steps need a TLB flush. Since this is a multithreaded workload, TLB flushes happen on 10 CPUs. On x86, TLB flush uses the generic smp_call_function machinery, so this workload stresses smp_call_function_many() heavily.

Without the patch, perf shows:

+  24.49%  [k] generic_smp_call_function_interrupt
-  21.72%  [k] _raw_spin_lock
   - _raw_spin_lock
      + 79.80% __page_check_address
      +  6.42% generic_smp_call_function_interrupt
      +  3.31% get_swap_page
      +  2.37% free_pcppages_bulk
      +  1.75% handle_pte_fault
      +  1.54% put_super
      +  1.41% grab_super_passive
      +  1.36% __swap_duplicate
      +  0.68% blk_flush_plug_list
      +  0.62% swap_info_get
+   6.55%  [k] flush_tlb_func
+   6.46%  [k] smp_call_function_many
+   5.09%  [k] call_function_interrupt
+   4.75%  [k] default_send_IPI_mask_sequence_phys
+   2.18%  [k] find_next_bit

swapout throughput is around 1300M/s.

With the patch, perf shows:

-  27.23%  [k] _raw_spin_lock
   - _raw_spin_lock
      + 80.53% __page_check_address
      +  8.39% generic_smp_call_function_single_interrupt
      +  2.44% get_swap_page
      +  1.76% free_pcppages_bulk
      +  1.40% handle_pte_fault
      +  1.15% __swap_duplicate
      +  1.05% put_super
      +  0.98% grab_super_passive
      +  0.86% blk_flush_plug_list
      +  0.57% swap_info_get
+   8.25%  [k] default_send_IPI_mask_sequence_phys
+   7.55%  [k] call_function_interrupt
+   7.47%  [k] smp_call_function_many
+   7.25%  [k] flush_tlb_func
+   3.81%  [k] _raw_spin_lock_irqsave
+   3.78%  [k] generic_smp_call_function_single_interrupt

swapout throughput is around 1400M/s. So there is around a 7% improvement, and total CPU utilization doesn't change.

Without the patch, cfd_data is shared by all CPUs, and generic_smp_call_function_interrupt() reads and writes cfd_data several times, which creates a lot of cache-line ping-pong. With the patch the data becomes per-CPU, so the ping-pong is avoided. And from the perf data, this doesn't make the call_single_queue lock contended.

The next step is to remove generic_smp_call_function_interrupt() from arch code.

Signed-off-by: Shaohua Li <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Linus Torvalds <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 6d1c7cc commit 9a46ad6
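
The change is easiest to read against the call pattern it optimizes. Below is a minimal sketch, not part of this commit, of a caller in the style of the x86 TLB-flush path the benchmark stresses; remote_flush() and flush_others() are made-up names. Every CPU in the mask runs the callback in IPI context, and with wait=true the sender spins until all targets have finished.

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/preempt.h>

/* Runs on each target CPU, in IPI context with interrupts disabled. */
static void remote_flush(void *info)
{
        /* e.g. flush this CPU's TLB for the range described by info */
}

static void flush_others(const struct cpumask *mask)
{
        preempt_disable();
        /*
         * The calling CPU is skipped by smp_call_function_many(); with
         * wait=true we return only after every target ran remote_flush().
         */
        smp_call_function_many(mask, remote_flush, NULL, true);
        preempt_enable();
}

The calling CPU is excluded from the mask inside smp_call_function_many(), which is why TLB-flush-style callers handle the local flush themselves and use this only for the other CPUs.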

File tree: 2 files changed, 32 additions and 154 deletions


include/linux/smp.h

Lines changed: 2 additions & 1 deletion
@@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
-void generic_smp_call_function_interrupt(void);
+#define generic_smp_call_function_interrupt \
+        generic_smp_call_function_single_interrupt
 #else
 static inline void call_function_init(void) { }
 #endif
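
With the header change above, generic_smp_call_function_interrupt is now just an alias for generic_smp_call_function_single_interrupt, so an architecture's "call function" IPI vector funnels into the same handler as its "call function single" vector. A hedged sketch of what that looks like from the arch side; the my_arch_* names are hypothetical, and a real handler would also ack the IPI and wrap the call in irq_enter()/irq_exit().

#include <linux/smp.h>

/* "call function" IPI vector */
void my_arch_call_function_interrupt(void)
{
        /* expands to generic_smp_call_function_single_interrupt() now */
        generic_smp_call_function_interrupt();
}

/* "call function single" IPI vector */
void my_arch_call_function_single_interrupt(void)
{
        generic_smp_call_function_single_interrupt();
}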

kernel/smp.c

Lines changed: 30 additions & 153 deletions
@@ -16,22 +16,12 @@
 #include "smpboot.h"
 
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
-        struct list_head        queue;
-        raw_spinlock_t          lock;
-} call_function __cacheline_aligned_in_smp =
-        {
-                .queue = LIST_HEAD_INIT(call_function.queue),
-                .lock  = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
-        };
-
 enum {
         CSD_FLAG_LOCK           = 0x01,
 };
 
 struct call_function_data {
-        struct call_single_data csd;
-        atomic_t                refs;
+        struct call_single_data __percpu *csd;
         cpumask_var_t           cpumask;
         cpumask_var_t           cpumask_ipi;
 };
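
For orientation: cfd_data, an instance of the struct just modified, is already a per-CPU variable in this file (the sender grabs its own copy with __get_cpu_var(cfd_data) later in the diff). What this hunk changes is that the csd slots handed to the target CPUs become per-CPU as well, instead of one shared descriptor plus a shared refs counter that every target writes. A hedged sketch of the access pattern; target_slot() is an invented helper, and the declaration is shown only for orientation on this era's kernel/smp.c.

#include <linux/percpu.h>
#include <linux/smp.h>

/* Sender-side per-CPU descriptor, as declared in this era's kernel/smp.c. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);

/*
 * Illustrative only: after this patch the slot queued to a given target CPU
 * is that CPU's private element of the percpu array, so concurrent senders
 * and targets no longer bounce a shared cache line.
 */
static struct call_single_data *target_slot(struct call_function_data *cfd, int cpu)
{
        return per_cpu_ptr(cfd->csd, cpu);
}
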
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
                                 cpu_to_node(cpu)))
                         return notifier_from_errno(-ENOMEM);
+                cfd->csd = alloc_percpu(struct call_single_data);
+                if (!cfd->csd) {
+                        free_cpumask_var(cfd->cpumask);
+                        return notifier_from_errno(-ENOMEM);
+                }
                 break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
         case CPU_DEAD_FROZEN:
                 free_cpumask_var(cfd->cpumask);
                 free_cpumask_var(cfd->cpumask_ipi);
+                free_percpu(cfd->csd);
                 break;
 #endif
         };
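
The two hotplug hunks above pair the new allocation with its teardown. A self-contained sketch of the same pattern, with invented demo_* names: percpu storage from alloc_percpu() must be released with free_percpu(), and an allocation that fails part-way has to unwind whatever already succeeded.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/smp.h>
#include <linux/gfp.h>

struct demo_cfd {
        struct call_single_data __percpu *csd;
        cpumask_var_t cpumask;
};

static int demo_cfd_alloc(struct demo_cfd *cfd, int cpu)
{
        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                return -ENOMEM;

        cfd->csd = alloc_percpu(struct call_single_data);
        if (!cfd->csd) {
                free_cpumask_var(cfd->cpumask); /* unwind the earlier allocation */
                return -ENOMEM;
        }
        return 0;
}

static void demo_cfd_free(struct demo_cfd *cfd)
{
        free_cpumask_var(cfd->cpumask);
        free_percpu(cfd->csd);
}
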
@@ -170,85 +166,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
                 csd_lock_wait(data);
 }
 
-/*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
-        struct call_function_data *data;
-        int cpu = smp_processor_id();
-
-        /*
-         * Shouldn't receive this interrupt on a cpu that is not yet online.
-         */
-        WARN_ON_ONCE(!cpu_online(cpu));
-
-        /*
-         * Ensure entry is visible on call_function_queue after we have
-         * entered the IPI. See comment in smp_call_function_many.
-         * If we don't have this, then we may miss an entry on the list
-         * and never get another IPI to process it.
-         */
-        smp_mb();
-
-        /*
-         * It's ok to use list_for_each_rcu() here even though we may
-         * delete 'pos', since list_del_rcu() doesn't clear ->next
-         */
-        list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
-                int refs;
-                smp_call_func_t func;
-
-                /*
-                 * Since we walk the list without any locks, we might
-                 * see an entry that was completed, removed from the
-                 * list and is in the process of being reused.
-                 *
-                 * We must check that the cpu is in the cpumask before
-                 * checking the refs, and both must be set before
-                 * executing the callback on this cpu.
-                 */
-
-                if (!cpumask_test_cpu(cpu, data->cpumask))
-                        continue;
-
-                smp_rmb();
-
-                if (atomic_read(&data->refs) == 0)
-                        continue;
-
-                func = data->csd.func;  /* save for later warn */
-                func(data->csd.info);
-
-                /*
-                 * If the cpu mask is not still set then func enabled
-                 * interrupts (BUG), and this cpu took another smp call
-                 * function interrupt and executed func(info) twice
-                 * on this cpu. That nested execution decremented refs.
-                 */
-                if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
-                        WARN(1, "%pf enabled interrupts and double executed\n", func);
-                        continue;
-                }
-
-                refs = atomic_dec_return(&data->refs);
-                WARN_ON(refs < 0);
-
-                if (refs)
-                        continue;
-
-                WARN_ON(!cpumask_empty(data->cpumask));
-
-                raw_spin_lock(&call_function.lock);
-                list_del_rcu(&data->csd.list);
-                raw_spin_unlock(&call_function.lock);
-
-                csd_unlock(&data->csd);
-        }
-
-}
-
 /*
  * Invoked by arch to handle an IPI for call function single. Must be
  * called from the arch with interrupts disabled.
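
The handler that survives is the one named in the context lines above. A rough, paraphrased sketch of what it does follows; it is simplified from this era's kernel/smp.c, the types and helpers it touches (call_single_queue, csd_unlock) are local to that file so it only reads as if it lived there, and the real handler also copes with csds queued without CSD_FLAG_LOCK by generic_exec_single(). The point is that it drains only this CPU's private queue, then releases each csd so a sender blocked in csd_lock_wait() can proceed.

static void sketch_call_function_single_interrupt(void)
{
        struct call_single_queue *q = &__get_cpu_var(call_single_queue);
        LIST_HEAD(list);

        /* Take everything queued for this CPU in one go. */
        raw_spin_lock(&q->lock);
        list_replace_init(&q->list, &list);
        raw_spin_unlock(&q->lock);

        while (!list_empty(&list)) {
                struct call_single_data *csd =
                        list_first_entry(&list, struct call_single_data, list);

                list_del(&csd->list);
                csd->func(csd->info);   /* run the cross-CPU callback */
                csd_unlock(csd);        /* lets the sender's csd_lock_wait() return */
        }
}
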
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
                             smp_call_func_t func, void *info, bool wait)
 {
         struct call_function_data *data;
-        unsigned long flags;
-        int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+        int cpu, next_cpu, this_cpu = smp_processor_id();
 
         /*
          * Can deadlock when called with interrupts disabled.
@@ -486,85 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,
         }
 
         data = &__get_cpu_var(cfd_data);
-        csd_lock(&data->csd);
-
-        /* This BUG_ON verifies our reuse assertions and can be removed */
-        BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-
-        /*
-         * The global call function queue list add and delete are protected
-         * by a lock, but the list is traversed without any lock, relying
-         * on the rcu list add and delete to allow safe concurrent traversal.
-         * We reuse the call function data without waiting for any grace
-         * period after some other cpu removes it from the global queue.
-         * This means a cpu might find our data block as it is being
-         * filled out.
-         *
-         * We hold off the interrupt handler on the other cpu by
-         * ordering our writes to the cpu mask vs our setting of the
-         * refs counter. We assert only the cpu owning the data block
-         * will set a bit in cpumask, and each bit will only be cleared
-         * by the subject cpu. Each cpu must first find its bit is
-         * set and then check that refs is set indicating the element is
-         * ready to be processed, otherwise it must skip the entry.
-         *
-         * On the previous iteration refs was set to 0 by another cpu.
-         * To avoid the use of transitivity, set the counter to 0 here
-         * so the wmb will pair with the rmb in the interrupt handler.
-         */
-        atomic_set(&data->refs, 0);     /* convert 3rd to 1st party write */
-
-        data->csd.func = func;
-        data->csd.info = info;
 
-        /* Ensure 0 refs is visible before mask. Also orders func and info */
-        smp_wmb();
-
-        /* We rely on the "and" being processed before the store */
         cpumask_and(data->cpumask, mask, cpu_online_mask);
         cpumask_clear_cpu(this_cpu, data->cpumask);
-        refs = cpumask_weight(data->cpumask);
 
         /* Some callers race with other cpus changing the passed mask */
-        if (unlikely(!refs)) {
-                csd_unlock(&data->csd);
+        if (unlikely(!cpumask_weight(data->cpumask)))
                 return;
-        }
 
         /*
          * After we put an entry into the list, data->cpumask
          * may be cleared again when another CPU sends another IPI for
          * a SMP function call, so data->cpumask will be zero.
          */
         cpumask_copy(data->cpumask_ipi, data->cpumask);
-        raw_spin_lock_irqsave(&call_function.lock, flags);
-        /*
-         * Place entry at the _HEAD_ of the list, so that any cpu still
-         * observing the entry in generic_smp_call_function_interrupt()
-         * will not miss any other list entries:
-         */
-        list_add_rcu(&data->csd.list, &call_function.queue);
-        /*
-         * We rely on the wmb() in list_add_rcu to complete our writes
-         * to the cpumask before this write to refs, which indicates
-         * data is on the list and is ready to be processed.
-         */
-        atomic_set(&data->refs, refs);
-        raw_spin_unlock_irqrestore(&call_function.lock, flags);
 
-        /*
-         * Make the list addition visible before sending the ipi.
-         * (IPIs must obey or appear to obey normal Linux cache
-         * coherency rules -- see comment in generic_exec_single).
-         */
-        smp_mb();
+        for_each_cpu(cpu, data->cpumask) {
+                struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+                struct call_single_queue *dst =
+                                        &per_cpu(call_single_queue, cpu);
+                unsigned long flags;
+
+                csd_lock(csd);
+                csd->func = func;
+                csd->info = info;
+
+                raw_spin_lock_irqsave(&dst->lock, flags);
+                list_add_tail(&csd->list, &dst->list);
+                raw_spin_unlock_irqrestore(&dst->lock, flags);
+        }
 
         /* Send a message to all CPUs in the map */
         arch_send_call_function_ipi_mask(data->cpumask_ipi);
 
-        /* Optionally wait for the CPUs to complete */
-        if (wait)
-                csd_lock_wait(&data->csd);
+        if (wait) {
+                for_each_cpu(cpu, data->cpumask) {
+                        struct call_single_data *csd =
+                                        per_cpu_ptr(data->csd, cpu);
+                        csd_lock_wait(csd);
+                }
+        }
 }
 EXPORT_SYMBOL(smp_call_function_many);
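
The new wait loop at the end relies on the CSD_FLAG_LOCK handshake rather than the old shared refs counter: the sender locks each per-CPU csd before queueing it, the target clears the flag after running func(info), and csd_lock_wait() spins until that happens. A hedged paraphrase of those helpers; the real ones live earlier in kernel/smp.c and the sketch_* names are invented here.

static void sketch_csd_lock_wait(struct call_single_data *csd)
{
        while (csd->flags & CSD_FLAG_LOCK)
                cpu_relax();            /* wait for the target CPU to finish */
}

static void sketch_csd_lock(struct call_single_data *csd)
{
        sketch_csd_lock_wait(csd);      /* csd may still be in flight from its last use */
        csd->flags = CSD_FLAG_LOCK;
        smp_mb();                       /* publish the lock before func/info writes */
}

static void sketch_csd_unlock(struct call_single_data *csd)
{
        smp_mb();                       /* finish using func/info before releasing */
        csd->flags &= ~CSD_FLAG_LOCK;
}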
