
Commit 7f0a002

joergroedel authored and torvalds committed
x86/mm: remove vmalloc faulting
Remove fault handling on vmalloc areas, as the vmalloc code now takes
care of synchronizing changes to all page-tables in the system.

Signed-off-by: Joerg Roedel <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: "Rafael J. Wysocki" <[email protected]>
Cc: Steven Rostedt (VMware) <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 73f693c commit 7f0a002
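
For readers unfamiliar with the two schemes, here is a minimal user-space sketch of the idea behind the change. This is an illustration only; every identifier below (toy_vmalloc_fault, toy_sync_kernel_mappings, the toy arrays) is hypothetical and not kernel code. The removed vmalloc_fault() path copied a missing top-level page-table entry from the reference table lazily, at fault time; the vmalloc code now propagates new entries to all page tables eagerly via arch_sync_kernel_mappings(), so the fault-time fixup is no longer needed. The diffs below remove the lazy path and its callers.

/*
 * Toy model, for illustration only: eager vs. lazy synchronization of a
 * "top-level page-table" entry.  None of these names exist in the kernel.
 */
#include <assert.h>

#define TOY_TOP_ENTRIES	8	/* entries per toy top-level table      */
#define TOY_NR_MMS	4	/* number of toy address spaces ("mms") */

static long reference_pgd[TOY_TOP_ENTRIES];		/* plays the role of init_mm.pgd */
static long mm_pgd[TOY_NR_MMS][TOY_TOP_ENTRIES];	/* per-process top-level tables  */

/* Old model: fix up one mm lazily, at fault time (what vmalloc_fault() did). */
static int toy_vmalloc_fault(long *pgd, int idx)
{
	if (!reference_pgd[idx])
		return -1;			/* no such mapping: a real fault    */
	if (!pgd[idx])
		pgd[idx] = reference_pgd[idx];	/* copy the missing top-level entry */
	return 0;
}

/* New model: propagate a freshly created entry to every table right away. */
static void toy_sync_kernel_mappings(int idx)
{
	for (int i = 0; i < TOY_NR_MMS; i++)
		mm_pgd[i][idx] = reference_pgd[idx];
}

int main(void)
{
	/* The allocator populates a new top-level entry in the reference table */
	/* and immediately syncs it, so no mm can fault on it afterwards.       */
	reference_pgd[3] = 0x1000;
	toy_sync_kernel_mappings(3);
	assert(mm_pgd[2][3] == reference_pgd[3]);

	/* The removed model instead left the entry missing and relied on the   */
	/* page-fault handler to copy it in on first access.                    */
	reference_pgd[5] = 0x2000;
	assert(toy_vmalloc_fault(mm_pgd[1], 5) == 0);
	assert(mm_pgd[1][5] == reference_pgd[5]);
	return 0;
}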

5 files changed: +4 -204 lines changed


arch/x86/include/asm/switch_to.h

Lines changed: 0 additions & 23 deletions
@@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
 					   struct task_struct *next);
 
-/* This runs runs on the previous thread's stack. */
-static inline void prepare_switch_to(struct task_struct *next)
-{
-#ifdef CONFIG_VMAP_STACK
-	/*
-	 * If we switch to a stack that has a top-level paging entry
-	 * that is not present in the current mm, the resulting #PF will
-	 * will be promoted to a double-fault and we'll panic. Probe
-	 * the new stack now so that vmalloc_fault can fix up the page
-	 * tables if needed. This can only happen if we use a stack
-	 * in vmap space.
-	 *
-	 * We assume that the stack is aligned so that it never spans
-	 * more than one top-level paging entry.
-	 *
-	 * To minimize cache pollution, just follow the stack pointer.
-	 */
-	READ_ONCE(*(unsigned char *)next->thread.sp);
-#endif
-}
-
 asmlinkage void ret_from_fork(void);
 
 /*
@@ -67,8 +46,6 @@ struct fork_frame {
 
 #define switch_to(prev, next, last)					\
 do {									\
-	prepare_switch_to(next);					\
-									\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)
 

arch/x86/kernel/setup_percpu.c

Lines changed: 3 additions & 3 deletions
@@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void)
 	/*
 	 * Sync back kernel address range again. We already did this in
 	 * setup_arch(), but percpu data also needs to be available in
-	 * the smpboot asm. We can't reliably pick up percpu mappings
-	 * using vmalloc_fault(), because exception dispatch needs
-	 * percpu data.
+	 * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
+	 * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
+	 * there too.
	 *
	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
	 * this call?

arch/x86/mm/fault.c

Lines changed: 0 additions & 134 deletions
@@ -214,44 +214,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
 	}
 }
 
-/*
- * 32-bit:
- *
- *   Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-	unsigned long pgd_paddr;
-	pmd_t *pmd_k;
-	pte_t *pte_k;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	/*
-	 * Synchronize this task's top level page-table
-	 * with the 'reference' page table.
-	 *
-	 * Do _not_ use "current" here. We might be inside
-	 * an interrupt in the middle of a task switch..
-	 */
-	pgd_paddr = read_cr3_pa();
-	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-	if (!pmd_k)
-		return -1;
-
-	if (pmd_large(*pmd_k))
-		return 0;
-
-	pte_k = pte_offset_kernel(pmd_k, address);
-	if (!pte_present(*pte_k))
-		return -1;
-
-	return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
 /*
  * Did it hit the DOS screen memory VA from vm86 mode?
  */
@@ -316,79 +278,6 @@ static void dump_pagetable(unsigned long address)
 
 #else /* CONFIG_X86_64: */
 
-/*
- * 64-bit:
- *
- *   Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-	pgd_t *pgd, *pgd_k;
-	p4d_t *p4d, *p4d_k;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	/*
-	 * Copy kernel mappings over when needed. This can also
-	 * happen within a race in page table update. In the later
-	 * case just flush:
-	 */
-	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
-	pgd_k = pgd_offset_k(address);
-	if (pgd_none(*pgd_k))
-		return -1;
-
-	if (pgtable_l5_enabled()) {
-		if (pgd_none(*pgd)) {
-			set_pgd(pgd, *pgd_k);
-			arch_flush_lazy_mmu_mode();
-		} else {
-			BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
-		}
-	}
-
-	/* With 4-level paging, copying happens on the p4d level. */
-	p4d = p4d_offset(pgd, address);
-	p4d_k = p4d_offset(pgd_k, address);
-	if (p4d_none(*p4d_k))
-		return -1;
-
-	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
-		set_p4d(p4d, *p4d_k);
-		arch_flush_lazy_mmu_mode();
-	} else {
-		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
-	}
-
-	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
-	pud = pud_offset(p4d, address);
-	if (pud_none(*pud))
-		return -1;
-
-	if (pud_large(*pud))
-		return 0;
-
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd))
-		return -1;
-
-	if (pmd_large(*pmd))
-		return 0;
-
-	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
-		return -1;
-
-	return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
 #ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] =
 KERN_ERR
@@ -1227,29 +1116,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	 */
 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
-	/*
-	 * We can fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * Before doing this on-demand faulting, ensure that the
-	 * fault is not any of the following:
-	 * 1. A fault on a PTE with a reserved bit set.
-	 * 2. A fault caused by a user-mode access. (Do not demand-
-	 *    fault kernel memory due to user-mode accesses).
-	 * 3. A fault caused by a page-level protection violation.
-	 *    (A demand fault would be on a non-present page which
-	 *     would have X86_PF_PROT==0).
-	 */
-	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-		if (vmalloc_fault(address) >= 0)
-			return;
-	}
-
 	/* Was the fault spurious, caused by lazy TLB invalidation? */
 	if (spurious_kernel_fault(hw_error_code, address))
 		return;

arch/x86/mm/pti.c

Lines changed: 1 addition & 7 deletions
@@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void)
		 * the sp1 and sp2 slots.
		 *
		 * This is done for all possible CPUs during boot to ensure
-		 * that it's propagated to all mms. If we were to add one of
-		 * these mappings during CPU hotplug, we would need to take
-		 * some measure to make sure that every mm that subsequently
-		 * ran on that CPU would have the relevant PGD entry in its
-		 * pagetables. The usual vmalloc_fault() mechanism would not
-		 * work for page faults taken in entry_SYSCALL_64 before RSP
-		 * is set up.
+		 * that it's propagated to all mms.
		 */

		unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);

arch/x86/mm/tlb.c

Lines changed: 0 additions & 37 deletions
@@ -161,34 +161,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	local_irq_restore(flags);
 }
 
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
-	unsigned long sp = current_stack_pointer;
-	pgd_t *pgd = pgd_offset(mm, sp);
-
-	if (pgtable_l5_enabled()) {
-		if (unlikely(pgd_none(*pgd))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-
-			set_pgd(pgd, *pgd_ref);
-		}
-	} else {
-		/*
-		 * "pgd" is faked. The top level entries are "p4d"s, so sync
-		 * the p4d. This compiles to approximately the same code as
-		 * the 5-level case.
-		 */
-		p4d_t *p4d = p4d_offset(pgd, sp);
-
-		if (unlikely(p4d_none(*p4d))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
-			set_p4d(p4d, *p4d_ref);
-		}
-	}
-}
-
 static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
 {
 	unsigned long next_tif = task_thread_info(next)->flags;
@@ -377,15 +349,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 */
 	cond_ibpb(tsk);
 
-	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-		/*
-		 * If our current stack is in vmalloc space and isn't
-		 * mapped in the new pgd, we'll double-fault. Forcibly
-		 * map it.
-		 */
-		sync_current_stack_to_mm(next);
-	}
-
 	/*
 	 * Stop remote flushes for the previous mm.
 	 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
