Skip to content

Commit 015b315

Browse files
committed
Merge tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar:

 - more generic entry code ABI fallout

 - debug register handling bugfixes

 - fix vmalloc mappings on 32-bit kernels

 - kprobes instrumentation output fix on 32-bit kernels

 - fix over-eager WARN_ON_ONCE() on !SMAP hardware

 - NUMA debugging fix

 - fix Clang related crash on !RETPOLINE kernels

* tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry: Unbreak 32bit fast syscall
  x86/debug: Allow a single level of #DB recursion
  x86/entry: Fix AC assertion
  tracing/kprobes, x86/ptrace: Fix regs argument order for i386
  x86, fakenuma: Fix invalid starting node ID
  x86/mm/32: Bring back vmalloc faulting on x86_32
  x86/cmdline: Disable jump tables for cmdline.c
2 parents 68beef5 + 4facb95 commit 015b315

File tree

9 files changed

+213
-63
lines changed

9 files changed

+213
-63
lines changed

arch/x86/entry/common.c

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
6060
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
6161
static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
6262
{
63-
unsigned int nr = (unsigned int)regs->orig_ax;
64-
6563
if (IS_ENABLED(CONFIG_IA32_EMULATION))
6664
current_thread_info()->status |= TS_COMPAT;
67-
/*
68-
* Subtlety here: if ptrace pokes something larger than 2^32-1 into
69-
* orig_ax, the unsigned int return value truncates it. This may
70-
* or may not be necessary, but it matches the old asm behavior.
71-
*/
72-
return (unsigned int)syscall_enter_from_user_mode(regs, nr);
65+
66+
return (unsigned int)regs->orig_ax;
7367
}
7468

7569
/*
@@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
9185
{
9286
unsigned int nr = syscall_32_enter(regs);
9387

88+
/*
89+
* Subtlety here: if ptrace pokes something larger than 2^32-1 into
90+
* orig_ax, the unsigned int return value truncates it. This may
91+
* or may not be necessary, but it matches the old asm behavior.
92+
*/
93+
nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
94+
9495
do_syscall_32_irqs_on(regs, nr);
9596
syscall_exit_to_user_mode(regs);
9697
}
9798

9899
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
99100
{
100-
unsigned int nr = syscall_32_enter(regs);
101+
unsigned int nr = syscall_32_enter(regs);
101102
int res;
102103

104+
/*
105+
* This cannot use syscall_enter_from_user_mode() as it has to
106+
* fetch EBP before invoking any of the syscall entry work
107+
* functions.
108+
*/
109+
syscall_enter_from_user_mode_prepare(regs);
110+
103111
instrumentation_begin();
104112
/* Fetch EBP from where the vDSO stashed it. */
105113
if (IS_ENABLED(CONFIG_X86_64)) {
@@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
122130
return false;
123131
}
124132

133+
/* The case truncates any ptrace induced syscall nr > 2^32 -1 */
134+
nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
135+
125136
/* Now this is just like a normal syscall. */
126137
do_syscall_32_irqs_on(regs, nr);
127138
syscall_exit_to_user_mode(regs);

arch/x86/include/asm/entry-common.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,16 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
1818
* state, not the interrupt state as imagined by Xen.
1919
*/
2020
unsigned long flags = native_save_fl();
21-
WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
22-
X86_EFLAGS_NT));
21+
unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;
22+
23+
/*
24+
* For !SMAP hardware we patch out CLAC on entry.
25+
*/
26+
if (boot_cpu_has(X86_FEATURE_SMAP) ||
27+
(IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
28+
mask |= X86_EFLAGS_AC;
29+
30+
WARN_ON_ONCE(flags & mask);
2331

2432
/* We think we came from user mode. Make sure pt_regs agrees. */
2533
WARN_ON_ONCE(!user_mode(regs));

arch/x86/include/asm/ptrace.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
327327
static const unsigned int argument_offs[] = {
328328
#ifdef __i386__
329329
offsetof(struct pt_regs, ax),
330-
offsetof(struct pt_regs, cx),
331330
offsetof(struct pt_regs, dx),
331+
offsetof(struct pt_regs, cx),
332332
#define NR_REG_ARGUMENTS 3
333333
#else
334334
offsetof(struct pt_regs, di),

arch/x86/kernel/traps.c

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
729729
#endif
730730
}
731731

732-
static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
732+
static __always_inline unsigned long debug_read_clear_dr6(void)
733733
{
734-
/*
735-
* Disable breakpoints during exception handling; recursive exceptions
736-
* are exceedingly 'fun'.
737-
*
738-
* Since this function is NOKPROBE, and that also applies to
739-
* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
740-
* HW_BREAKPOINT_W on our stack)
741-
*
742-
* Entry text is excluded for HW_BP_X and cpu_entry_area, which
743-
* includes the entry stack is excluded for everything.
744-
*/
745-
*dr7 = local_db_save();
734+
unsigned long dr6;
746735

747736
/*
748737
* The Intel SDM says:
@@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
755744
*
756745
* Keep it simple: clear DR6 immediately.
757746
*/
758-
get_debugreg(*dr6, 6);
747+
get_debugreg(dr6, 6);
759748
set_debugreg(0, 6);
760749
/* Filter out all the reserved bits which are preset to 1 */
761-
*dr6 &= ~DR6_RESERVED;
762-
}
750+
dr6 &= ~DR6_RESERVED;
763751

764-
static __always_inline void debug_exit(unsigned long dr7)
765-
{
766-
local_db_restore(dr7);
752+
return dr6;
767753
}
768754

769755
/*
@@ -863,6 +849,18 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
863849
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
864850
unsigned long dr6)
865851
{
852+
/*
853+
* Disable breakpoints during exception handling; recursive exceptions
854+
* are exceedingly 'fun'.
855+
*
856+
* Since this function is NOKPROBE, and that also applies to
857+
* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
858+
* HW_BREAKPOINT_W on our stack)
859+
*
860+
* Entry text is excluded for HW_BP_X and cpu_entry_area, which
861+
* includes the entry stack is excluded for everything.
862+
*/
863+
unsigned long dr7 = local_db_save();
866864
bool irq_state = idtentry_enter_nmi(regs);
867865
instrumentation_begin();
868866

@@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
883881

884882
instrumentation_end();
885883
idtentry_exit_nmi(regs, irq_state);
884+
885+
local_db_restore(dr7);
886886
}
887887

888888
static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
894894
*/
895895
WARN_ON_ONCE(!user_mode(regs));
896896

897+
/*
898+
* NB: We can't easily clear DR7 here because
899+
* idtentry_exit_to_usermode() can invoke ptrace, schedule, access
900+
* user memory, etc. This means that a recursive #DB is possible. If
901+
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
902+
* Since we're not on the IST stack right now, everything will be
903+
* fine.
904+
*/
905+
897906
irqentry_enter_from_user_mode(regs);
898907
instrumentation_begin();
899908

@@ -907,36 +916,24 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
907916
/* IST stack entry */
908917
DEFINE_IDTENTRY_DEBUG(exc_debug)
909918
{
910-
unsigned long dr6, dr7;
911-
912-
debug_enter(&dr6, &dr7);
913-
exc_debug_kernel(regs, dr6);
914-
debug_exit(dr7);
919+
exc_debug_kernel(regs, debug_read_clear_dr6());
915920
}
916921

917922
/* User entry, runs on regular task stack */
918923
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
919924
{
920-
unsigned long dr6, dr7;
921-
922-
debug_enter(&dr6, &dr7);
923-
exc_debug_user(regs, dr6);
924-
debug_exit(dr7);
925+
exc_debug_user(regs, debug_read_clear_dr6());
925926
}
926927
#else
927928
/* 32 bit does not have separate entry points. */
928929
DEFINE_IDTENTRY_RAW(exc_debug)
929930
{
930-
unsigned long dr6, dr7;
931-
932-
debug_enter(&dr6, &dr7);
931+
unsigned long dr6 = debug_read_clear_dr6();
933932

934933
if (user_mode(regs))
935934
exc_debug_user(regs, dr6);
936935
else
937936
exc_debug_kernel(regs, dr6);
938-
939-
debug_exit(dr7);
940937
}
941938
#endif
942939

arch/x86/lib/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER
2424
CFLAGS_REMOVE_cmdline.o = -pg
2525
endif
2626

27-
CFLAGS_cmdline.o := -fno-stack-protector
27+
CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
2828
endif
2929

3030
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk

arch/x86/mm/fault.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
190190
return pmd_k;
191191
}
192192

193+
/*
194+
* Handle a fault on the vmalloc or module mapping area
195+
*
196+
* This is needed because there is a race condition between the time
197+
* when the vmalloc mapping code updates the PMD to the point in time
198+
* where it synchronizes this update with the other page-tables in the
199+
* system.
200+
*
201+
* In this race window another thread/CPU can map an area on the same
202+
* PMD, finds it already present and does not synchronize it with the
203+
* rest of the system yet. As a result v[mz]alloc might return areas
204+
* which are not mapped in every page-table in the system, causing an
205+
* unhandled page-fault when they are accessed.
206+
*/
207+
static noinline int vmalloc_fault(unsigned long address)
208+
{
209+
unsigned long pgd_paddr;
210+
pmd_t *pmd_k;
211+
pte_t *pte_k;
212+
213+
/* Make sure we are in vmalloc area: */
214+
if (!(address >= VMALLOC_START && address < VMALLOC_END))
215+
return -1;
216+
217+
/*
218+
* Synchronize this task's top level page-table
219+
* with the 'reference' page table.
220+
*
221+
* Do _not_ use "current" here. We might be inside
222+
* an interrupt in the middle of a task switch..
223+
*/
224+
pgd_paddr = read_cr3_pa();
225+
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
226+
if (!pmd_k)
227+
return -1;
228+
229+
if (pmd_large(*pmd_k))
230+
return 0;
231+
232+
pte_k = pte_offset_kernel(pmd_k, address);
233+
if (!pte_present(*pte_k))
234+
return -1;
235+
236+
return 0;
237+
}
238+
NOKPROBE_SYMBOL(vmalloc_fault);
239+
193240
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
194241
{
195242
unsigned long addr;
@@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
11101157
*/
11111158
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
11121159

1160+
#ifdef CONFIG_X86_32
1161+
/*
1162+
* We can fault-in kernel-space virtual memory on-demand. The
1163+
* 'reference' page table is init_mm.pgd.
1164+
*
1165+
* NOTE! We MUST NOT take any locks for this case. We may
1166+
* be in an interrupt or a critical region, and should
1167+
* only copy the information from the master page table,
1168+
* nothing more.
1169+
*
1170+
* Before doing this on-demand faulting, ensure that the
1171+
* fault is not any of the following:
1172+
* 1. A fault on a PTE with a reserved bit set.
1173+
* 2. A fault caused by a user-mode access. (Do not demand-
1174+
* fault kernel memory due to user-mode accesses).
1175+
* 3. A fault caused by a page-level protection violation.
1176+
* (A demand fault would be on a non-present page which
1177+
* would have X86_PF_PROT==0).
1178+
*
1179+
* This is only needed to close a race condition on x86-32 in
1180+
* the vmalloc mapping/unmapping code. See the comment above
1181+
* vmalloc_fault() for details. On x86-64 the race does not
1182+
* exist as the vmalloc mappings don't need to be synchronized
1183+
* there.
1184+
*/
1185+
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1186+
if (vmalloc_fault(address) >= 0)
1187+
return;
1188+
}
1189+
#endif
1190+
11131191
/* Was the fault spurious, caused by lazy TLB invalidation? */
11141192
if (spurious_kernel_fault(hw_error_code, address))
11151193
return;

arch/x86/mm/numa_emulation.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
321321
u64 addr, u64 max_addr, u64 size)
322322
{
323323
return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
324-
0, NULL, NUMA_NO_NODE);
324+
0, NULL, 0);
325325
}
326326

327327
static int __init setup_emu2phys_nid(int *dfl_phys_nid)

include/linux/entry-common.h

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
110110
#endif
111111

112112
/**
113-
* syscall_enter_from_user_mode - Check and handle work before invoking
114-
* a syscall
113+
* syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
115114
* @regs: Pointer to currents pt_regs
116-
* @syscall: The syscall number
117115
*
118116
* Invoked from architecture specific syscall entry code with interrupts
119117
* disabled. The calling code has to be non-instrumentable. When the
120-
* function returns all state is correct and the subsequent functions can be
121-
* instrumented.
118+
* function returns all state is correct, interrupts are enabled and the
119+
* subsequent functions can be instrumented.
120+
*
121+
* This handles lockdep, RCU (context tracking) and tracing state.
122+
*
123+
* This is invoked when there is extra architecture specific functionality
124+
* to be done between establishing state and handling user mode entry work.
125+
*/
126+
void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
127+
128+
/**
129+
* syscall_enter_from_user_mode_work - Check and handle work before invoking
130+
* a syscall
131+
* @regs: Pointer to currents pt_regs
132+
* @syscall: The syscall number
133+
*
134+
* Invoked from architecture specific syscall entry code with interrupts
135+
* enabled after invoking syscall_enter_from_user_mode_prepare() and extra
136+
* architecture specific work.
122137
*
123138
* Returns: The original or a modified syscall number
124139
*
@@ -127,12 +142,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
127142
* syscall_set_return_value() first. If neither of those are called and -1
128143
* is returned, then the syscall will fail with ENOSYS.
129144
*
130-
* The following functionality is handled here:
145+
* It handles the following work items:
131146
*
132-
* 1) Establish state (lockdep, RCU (context tracking), tracing)
133-
* 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
147+
* 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
134148
* __secure_computing(), trace_sys_enter()
135-
* 3) Invocation of audit_syscall_entry()
149+
* 2) Invocation of audit_syscall_entry()
150+
*/
151+
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
152+
153+
/**
154+
* syscall_enter_from_user_mode - Establish state and check and handle work
155+
* before invoking a syscall
156+
* @regs: Pointer to currents pt_regs
157+
* @syscall: The syscall number
158+
*
159+
* Invoked from architecture specific syscall entry code with interrupts
160+
* disabled. The calling code has to be non-instrumentable. When the
161+
* function returns all state is correct, interrupts are enabled and the
162+
* subsequent functions can be instrumented.
163+
*
164+
* This is combination of syscall_enter_from_user_mode_prepare() and
165+
* syscall_enter_from_user_mode_work().
166+
*
167+
* Returns: The original or a modified syscall number. See
168+
* syscall_enter_from_user_mode_work() for further explanation.
136169
*/
137170
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
138171

0 commit comments

Comments
 (0)