Commit 87e65ad

powerpc/mm/64s/hash: Add real-mode change_memory_range() for hash LPAR
When we enabled STRICT_KERNEL_RWX we received some reports of boot
failures when using the Hash MMU and running under phyp. The crashes are
intermittent, and often exhibit as a completely unresponsive system, or
possibly an oops.

One example, which was caught in xmon:

  [   14.068327][    T1] devtmpfs: mounted
  [   14.069302][    T1] Freeing unused kernel memory: 5568K
  [   14.142060][  T347] BUG: Unable to handle kernel instruction fetch
  [   14.142063][    T1] Run /sbin/init as init process
  [   14.142074][  T347] Faulting instruction address: 0xc000000000004400
  cpu 0x2: Vector: 400 (Instruction Access) at [c00000000c7475e0]
      pc: c000000000004400: exc_virt_0x4400_instruction_access+0x0/0x80
      lr: c0000000001862d4: update_rq_clock+0x44/0x110
      sp: c00000000c747880
     msr: 8000000040001031
    current = 0xc00000000c60d380
    paca    = 0xc00000001ec9de80  irqmask: 0x03  irq_happened: 0x01
      pid   = 347, comm = kworker/2:1
  ...
  enter ? for help
  [c00000000c747880] c0000000001862d4 update_rq_clock+0x44/0x110 (unreliable)
  [c00000000c7478f0] c000000000198794 update_blocked_averages+0xb4/0x6d0
  [c00000000c7479f0] c000000000198e40 update_nohz_stats+0x90/0xd0
  [c00000000c747a20] c0000000001a13b4 _nohz_idle_balance+0x164/0x390
  [c00000000c747b10] c0000000001a1af8 newidle_balance+0x478/0x610
  [c00000000c747be0] c0000000001a1d48 pick_next_task_fair+0x58/0x480
  [c00000000c747c40] c000000000eaab5c __schedule+0x12c/0x950
  [c00000000c747cd0] c000000000eab3e8 schedule+0x68/0x120
  [c00000000c747d00] c00000000016b730 worker_thread+0x130/0x640
  [c00000000c747da0] c000000000174d50 kthread+0x1a0/0x1b0
  [c00000000c747e10] c00000000000e0f0 ret_from_kernel_thread+0x5c/0x6c

This shows that CPU 2, which was idle, woke up and then appears to
randomly take an instruction fault on a completely valid area of kernel
text.

The cause turns out to be the call to hash__mark_rodata_ro(), late in
boot. Due to the way we lay out text and rodata, that function actually
changes the permissions for all of text and rodata to read-only plus
execute.

To do the permission change we use a hypervisor call, H_PROTECT. On
phyp that appears to be implemented by briefly removing the mapping of
the kernel text, before putting it back with the updated permissions.
If any other CPU is executing during that window, it will see spurious
faults on the kernel text and/or data, leading to crashes.

To fix it we use stop_machine() to collect all the other CPUs, and then
have them drop into real mode (MMU off) while we change the mapping.
That way they are unaffected by the mapping temporarily disappearing.

We don't see this bug on KVM because KVM always uses VPM=1, where
faults are directed to the hypervisor, and the fault will be serialised
against the h_protect() by HPTE_V_HVLOCK.

Signed-off-by: Michael Ellerman <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
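For context on the H_PROTECT path described above: on a pseries LPAR, the
mmu_hash_ops.hpte_updateboltedpp() call made by change_memory_range() ends
up issuing the hypercall through a small wrapper. The following is a
paraphrased sketch of that wrapper as it appears in contemporary kernel
trees (arch/powerpc/include/asm/plpar_wrappers.h); it is not part of this
commit and details may vary by version.

  /*
   * Paraphrased for context; not part of this diff. This is the
   * H_PROTECT hypercall that the commit message refers to: on phyp it
   * appears to briefly drop and re-insert the HPTE being updated.
   */
  static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
                                       unsigned long avpn)
  {
          return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
  }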
1 parent 6f223eb commit 87e65ad

arch/powerpc/mm/book3s64/hash_pgtable.c

Lines changed: 104 additions & 1 deletion
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/mm.h>
+#include <linux/stop_machine.h>
 
 #include <asm/sections.h>
 #include <asm/mmu.h>
@@ -400,6 +401,19 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct change_memory_parms {
+        unsigned long start, end, newpp;
+        unsigned int step, nr_cpus, master_cpu;
+        atomic_t cpu_counter;
+};
+
+// We'd rather this was on the stack but it has to be in the RMO
+static struct change_memory_parms chmem_parms;
+
+// And therefore we need a lock to protect it from concurrent use
+static DEFINE_MUTEX(chmem_lock);
+
 static void change_memory_range(unsigned long start, unsigned long end,
                                 unsigned int step, unsigned long newpp)
 {
@@ -414,6 +428,73 @@ static void change_memory_range(unsigned long start, unsigned long end,
                              mmu_kernel_ssize);
 }
 
+static int notrace chmem_secondary_loop(struct change_memory_parms *parms)
+{
+        unsigned long msr, tmp, flags;
+        int *p;
+
+        p = &parms->cpu_counter.counter;
+
+        local_irq_save(flags);
+        hard_irq_disable();
+
+        asm volatile (
+        // Switch to real mode and leave interrupts off
+        "mfmsr  %[msr]                  ;"
+        "li     %[tmp], %[MSR_IR_DR]    ;"
+        "andc   %[tmp], %[msr], %[tmp]  ;"
+        "mtmsrd %[tmp]                  ;"
+
+        // Tell the master we are in real mode
+        "1:                             "
+        "lwarx  %[tmp], 0, %[p]         ;"
+        "addic  %[tmp], %[tmp], -1      ;"
+        "stwcx. %[tmp], 0, %[p]         ;"
+        "bne-   1b                      ;"
+
+        // Spin until the counter goes to zero
+        "2:                             ;"
+        "lwz    %[tmp], 0(%[p])         ;"
+        "cmpwi  %[tmp], 0               ;"
+        "bne-   2b                      ;"
+
+        // Switch back to virtual mode
+        "mtmsrd %[msr]                  ;"
+
+        : // outputs
+          [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p)
+        : // inputs
+          [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR)
+        : // clobbers
+          "cc", "xer"
+        );
+
+        local_irq_restore(flags);
+
+        return 0;
+}
+
+static int change_memory_range_fn(void *data)
+{
+        struct change_memory_parms *parms = data;
+
+        if (parms->master_cpu != smp_processor_id())
+                return chmem_secondary_loop(parms);
+
+        // Wait for all but one CPU (this one) to call-in
+        while (atomic_read(&parms->cpu_counter) > 1)
+                barrier();
+
+        change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
+
+        mb();
+
+        // Signal the other CPUs that we're done
+        atomic_dec(&parms->cpu_counter);
+
+        return 0;
+}
+
 static bool hash__change_memory_range(unsigned long start, unsigned long end,
                                       unsigned long newpp)
 {
@@ -428,7 +509,29 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
         if (start >= end)
                 return false;
 
-        change_memory_range(start, end, step, newpp);
+        if (firmware_has_feature(FW_FEATURE_LPAR)) {
+                mutex_lock(&chmem_lock);
+
+                chmem_parms.start = start;
+                chmem_parms.end = end;
+                chmem_parms.step = step;
+                chmem_parms.newpp = newpp;
+                chmem_parms.master_cpu = smp_processor_id();
+
+                cpus_read_lock();
+
+                atomic_set(&chmem_parms.cpu_counter, num_online_cpus());
+
+                // Ensure state is consistent before we call the other CPUs
+                mb();
+
+                stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
+                                        cpu_online_mask);
+
+                cpus_read_unlock();
+                mutex_unlock(&chmem_lock);
+        } else
+                change_memory_range(start, end, step, newpp);
 
         return true;
 }
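For reference, the caller that triggers this path late in boot is
hash__mark_rodata_ro(), named in the commit message. The following is a
paraphrased sketch from the same file (hash_pgtable.c) in kernel trees of
this era; it is not part of the diff, and the protection-flag arguments
(e.g. HPTE_USE_KERNEL_KEY) vary between versions.

  /*
   * Paraphrased for context; not part of this diff. Because of the way
   * text and rodata are laid out, this covers all of text and rodata.
   */
  void hash__mark_rodata_ro(void)
  {
          unsigned long start, end, pp;

          start = (unsigned long)_stext;
          end = (unsigned long)__init_begin;

          /* Convert kernel read-only+execute protection into HPTE pp bits */
          pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX),
                                      HPTE_USE_KERNEL_KEY);

          WARN_ON(!hash__change_memory_range(start, end, pp));
  }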
