Skip to content

Commit 0cef77c

Browse files
npiggin authored and mpe committed
powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask
When a single-threaded process has a non-local mm_cpumask, try to use that point to flush the TLBs out of other CPUs in the cpumask. An IPI is used for clearing remote CPUs for a few reasons: - An IPI can end lazy TLB use of the mm, which is required to prevent TLB entries being created on the remote CPU. The alternative is to drop lazy TLB switching completely, which costs 7.5% in a context switch ping-pong test between a process and kernel idle thread. - An IPI can have remote CPUs flush the entire PID, but the local CPU can flush a specific VA. tlbie would require over-flushing of the local CPU (where the process is running). - A single threaded process that is migrated to a different CPU is likely to have a relatively small mm_cpumask, so IPI is reasonable. No other thread can concurrently switch to this mm, because it must have been given a reference to mm_users by the current thread before it can use_mm. mm_users can be asynchronously incremented (by mm_activate or mmget_not_zero), but those users must use remote mm access and can't use_mm or access user address space. Existing code makes this assumption already, for example sparc64 has reset mm_cpumask using this condition since the start of history, see arch/sparc/kernel/smp_64.c. This reduces tlbies for a kernel compile workload from 0.90M to 0.12M, tlbiels are increased significantly due to the PID flushing for cleaning up remote CPUs, and increased local flushes (PID flushes take 128 tlbiels vs 1 tlbie). Signed-off-by: Nicholas Piggin <[email protected]> Signed-off-by: Michael Ellerman <[email protected]>
1 parent 85bcfaf commit 0cef77c

File tree

2 files changed

+134
-27
lines changed

2 files changed

+134
-27
lines changed

arch/powerpc/include/asm/tlb.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,19 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
7676
return false;
7777
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
7878
}
79+
/*
 * Shrink a now single-threaded mm back to being "thread local": mark it as
 * active only on the calling CPU so subsequent flushes can use tlbiel.
 * Caller must have already cleared any remote (lazy TLB) users of the mm.
 */
static inline void mm_reset_thread_local(struct mm_struct *mm)
{
	/* Coprocessor (nest MMU) users may cache translations remotely. */
	WARN_ON(atomic_read(&mm->context.copros) > 0);
	/*
	 * It's possible for mm_access to take a reference on mm_users to
	 * access the remote mm from another thread, but it's not allowed
	 * to set mm_cpumask, so mm_users may be > 1 here.
	 */
	WARN_ON(current->mm != mm);
	/* Only this CPU is considered to have the mm active now. */
	atomic_set(&mm->context.active_cpus, 1);
	cpumask_clear(mm_cpumask(mm));
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
}
7992
#else /* CONFIG_PPC_BOOK3S_64 */
8093
static inline int mm_is_thread_local(struct mm_struct *mm)
8194
{

arch/powerpc/mm/tlb-radix.c

Lines changed: 121 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#include <linux/mm.h>
1313
#include <linux/hugetlb.h>
1414
#include <linux/memblock.h>
15+
#include <linux/mmu_context.h>
16+
#include <linux/sched/mm.h>
1517

1618
#include <asm/ppc-opcode.h>
1719
#include <asm/tlb.h>
@@ -504,17 +506,63 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
504506
}
505507
EXPORT_SYMBOL(radix__local_flush_tlb_page);
506508

509+
/*
 * True when @mm can only be used by the calling thread: no coprocessor
 * (nest MMU) contexts attached, at most one mm_users reference, and the
 * caller itself is the user. Only then is it safe to IPI-clear remote
 * CPUs out of mm_cpumask and fall back to local flushes.
 */
static bool mm_is_singlethreaded(struct mm_struct *mm)
{
	if (atomic_read(&mm->context.copros) > 0)
		return false;
	if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
		return true;
	return false;
}
517+
507518
/*
 * Whether a PID-wide flush must be escalated to RIC_FLUSH_ALL (TLB + page
 * walk cache) rather than RIC_FLUSH_TLB alone.
 */
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
	/*
	 * P9 nest MMU has issues with the page walk cache
	 * caching PTEs and not flushing them properly when
	 * RIC = 0 for a PID/LPID invalidate
	 */
	if (atomic_read(&mm->context.copros) > 0)
		return true;
	return false;
}
516529

517530
#ifdef CONFIG_SMP
531+
/*
 * IPI handler run on remote CPUs in mm_cpumask(@arg): end any lazy TLB use
 * of the mm by switching to init_mm, then flush the whole PID from the
 * local TLB so no stale entries survive the cpumask reset.
 */
static void do_exit_flush_lazy_tlb(void *arg)
{
	struct mm_struct *mm = arg;
	unsigned long pid = mm->context.id;

	if (current->mm == mm)
		return; /* Local CPU */

	if (current->active_mm == mm) {
		/*
		 * Must be a kernel thread because sender is single-threaded.
		 */
		BUG_ON(current->mm);
		/* Pin init_mm, switch onto it, and drop the lazy reference. */
		mmgrab(&init_mm);
		switch_mm(mm, &init_mm, current);
		current->active_mm = &init_mm;
		mmdrop(mm);
	}
	_tlbiel_pid(pid, RIC_FLUSH_ALL);
}
551+
552+
static void exit_flush_lazy_tlbs(struct mm_struct *mm)
553+
{
554+
/*
555+
* Would be nice if this was async so it could be run in
556+
* parallel with our local flush, but generic code does not
557+
* give a good API for it. Could extend the generic code or
558+
* make a special powerpc IPI for flushing TLBs.
559+
* For now it's not too performance critical.
560+
*/
561+
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
562+
(void *)mm, 1);
563+
mm_reset_thread_local(mm);
564+
}
565+
518566
void radix__flush_tlb_mm(struct mm_struct *mm)
519567
{
520568
unsigned long pid;
@@ -530,17 +578,24 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
530578
*/
531579
smp_mb();
532580
if (!mm_is_thread_local(mm)) {
581+
if (unlikely(mm_is_singlethreaded(mm))) {
582+
exit_flush_lazy_tlbs(mm);
583+
goto local;
584+
}
585+
533586
if (mm_needs_flush_escalation(mm))
534587
_tlbie_pid(pid, RIC_FLUSH_ALL);
535588
else
536589
_tlbie_pid(pid, RIC_FLUSH_TLB);
537-
} else
590+
} else {
591+
local:
538592
_tlbiel_pid(pid, RIC_FLUSH_TLB);
593+
}
539594
preempt_enable();
540595
}
541596
EXPORT_SYMBOL(radix__flush_tlb_mm);
542597

543-
void radix__flush_all_mm(struct mm_struct *mm)
598+
static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
544599
{
545600
unsigned long pid;
546601

@@ -550,12 +605,24 @@ void radix__flush_all_mm(struct mm_struct *mm)
550605

551606
preempt_disable();
552607
smp_mb(); /* see radix__flush_tlb_mm */
553-
if (!mm_is_thread_local(mm))
608+
if (!mm_is_thread_local(mm)) {
609+
if (unlikely(mm_is_singlethreaded(mm))) {
610+
if (!fullmm) {
611+
exit_flush_lazy_tlbs(mm);
612+
goto local;
613+
}
614+
}
554615
_tlbie_pid(pid, RIC_FLUSH_ALL);
555-
else
616+
} else {
617+
local:
556618
_tlbiel_pid(pid, RIC_FLUSH_ALL);
619+
}
557620
preempt_enable();
558621
}
622+
/*
 * Exported entry point for a full (TLB + PWC) flush of @mm.
 * fullmm=false: this is not the exit_mmap teardown path, so the
 * single-threaded optimization may still IPI-clear remote CPUs first.
 */
void radix__flush_all_mm(struct mm_struct *mm)
{
	__flush_all_mm(mm, false);
}
559626
EXPORT_SYMBOL(radix__flush_all_mm);
560627

561628
void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
@@ -575,10 +642,16 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
575642

576643
preempt_disable();
577644
smp_mb(); /* see radix__flush_tlb_mm */
578-
if (!mm_is_thread_local(mm))
645+
if (!mm_is_thread_local(mm)) {
646+
if (unlikely(mm_is_singlethreaded(mm))) {
647+
exit_flush_lazy_tlbs(mm);
648+
goto local;
649+
}
579650
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
580-
else
651+
} else {
652+
local:
581653
_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
654+
}
582655
preempt_enable();
583656
}
584657

@@ -638,14 +711,21 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
638711

639712
preempt_disable();
640713
smp_mb(); /* see radix__flush_tlb_mm */
641-
if (mm_is_thread_local(mm)) {
642-
local = true;
643-
full = (end == TLB_FLUSH_ALL ||
644-
nr_pages > tlb_local_single_page_flush_ceiling);
645-
} else {
714+
if (!mm_is_thread_local(mm)) {
715+
if (unlikely(mm_is_singlethreaded(mm))) {
716+
if (end != TLB_FLUSH_ALL) {
717+
exit_flush_lazy_tlbs(mm);
718+
goto is_local;
719+
}
720+
}
646721
local = false;
647722
full = (end == TLB_FLUSH_ALL ||
648723
nr_pages > tlb_single_page_flush_ceiling);
724+
} else {
725+
is_local:
726+
local = true;
727+
full = (end == TLB_FLUSH_ALL ||
728+
nr_pages > tlb_local_single_page_flush_ceiling);
649729
}
650730

651731
if (full) {
@@ -766,7 +846,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
766846
* See the comment for radix in arch_exit_mmap().
767847
*/
768848
if (tlb->fullmm) {
769-
radix__flush_all_mm(mm);
849+
__flush_all_mm(mm, true);
770850
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
771851
if (!tlb->need_flush_all)
772852
radix__flush_tlb_mm(mm);
@@ -800,24 +880,32 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
800880

801881
preempt_disable();
802882
smp_mb(); /* see radix__flush_tlb_mm */
803-
if (mm_is_thread_local(mm)) {
804-
local = true;
805-
full = (end == TLB_FLUSH_ALL ||
806-
nr_pages > tlb_local_single_page_flush_ceiling);
807-
} else {
883+
if (!mm_is_thread_local(mm)) {
884+
if (unlikely(mm_is_singlethreaded(mm))) {
885+
if (end != TLB_FLUSH_ALL) {
886+
exit_flush_lazy_tlbs(mm);
887+
goto is_local;
888+
}
889+
}
808890
local = false;
809891
full = (end == TLB_FLUSH_ALL ||
810892
nr_pages > tlb_single_page_flush_ceiling);
893+
} else {
894+
is_local:
895+
local = true;
896+
full = (end == TLB_FLUSH_ALL ||
897+
nr_pages > tlb_local_single_page_flush_ceiling);
811898
}
812899

813900
if (full) {
814-
if (!local && mm_needs_flush_escalation(mm))
815-
also_pwc = true;
816-
817-
if (local)
901+
if (local) {
818902
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
819-
else
820-
_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
903+
} else {
904+
if (mm_needs_flush_escalation(mm))
905+
also_pwc = true;
906+
907+
_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
908+
}
821909
} else {
822910
if (local)
823911
_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -859,10 +947,16 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
859947
/* Otherwise first do the PWC, then iterate the pages. */
860948
preempt_disable();
861949
smp_mb(); /* see radix__flush_tlb_mm */
862-
if (mm_is_thread_local(mm)) {
863-
_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
864-
} else {
950+
if (!mm_is_thread_local(mm)) {
951+
if (unlikely(mm_is_singlethreaded(mm))) {
952+
exit_flush_lazy_tlbs(mm);
953+
goto local;
954+
}
865955
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
956+
goto local;
957+
} else {
958+
local:
959+
_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
866960
}
867961

868962
preempt_enable();

0 commit comments

Comments
 (0)