
Commit 3c88c69

Peter Zijlstra authored and Ingo Molnar committed
x86/stackframe/32: Provide consistent pt_regs
Currently pt_regs on x86_32 has an oddity in that kernel regs
(!user_mode(regs)) are short two entries (esp/ss). This means that any
code trying to use them (typically: regs->sp) needs to jump through
some unfortunate hoops.

Change the entry code to fix this up and create a full pt_regs frame.

This then simplifies various trampolines in ftrace and kprobes, the
stack unwinder, ptrace, kdump and kgdb.

Much thanks to Josh for help with the cleanups!

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Josh Poimboeuf <[email protected]>
Acked-by: Masami Hiramatsu <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
1 parent ea1ed38 commit 3c88c69
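
For context, a minimal C sketch of the oddity being fixed. Before this commit, a kernel-mode exception on x86_32 pushed no esp/ss, so struct pt_regs ended early and the interrupted stack pointer had to be recovered from the address where the short frame stopped, as the soon-to-be-removed crash_fixup_ss_esp() further down also does. The helper name old_kernel_sp() is purely illustrative, not from the kernel:

#include <asm/ptrace.h>

/*
 * Illustrative sketch of the pre-commit situation: for kernel regs on
 * x86_32, the sp/ss slots of struct pt_regs were never filled in, so
 * the "real" stack pointer was the address where the short frame
 * ended, i.e. &regs->sp itself.
 */
static unsigned long old_kernel_sp(struct pt_regs *regs)
{
	if (user_mode(regs))
		return regs->sp;		/* pushed by hardware */
	return (unsigned long)&regs->sp;	/* short frame ends here */
}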

File tree

15 files changed (+177, −190 lines)


arch/x86/entry/entry_32.S

Lines changed: 95 additions & 10 deletions

@@ -202,9 +202,102 @@
 .Lend_\@:
 .endm

+#define CS_FROM_ENTRY_STACK	(1 << 31)
+#define CS_FROM_USER_CR3	(1 << 30)
+#define CS_FROM_KERNEL		(1 << 29)
+
+.macro FIXUP_FRAME
+	/*
+	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+	 * Clear them in case hardware didn't do this for us.
+	 */
+	andl	$0x0000ffff, 3*4(%esp)
+
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, 4*4(%esp)
+	jnz	.Lfrom_usermode_no_fixup_\@
+#endif
+	testl	$SEGMENT_RPL_MASK, 3*4(%esp)
+	jnz	.Lfrom_usermode_no_fixup_\@
+
+	orl	$CS_FROM_KERNEL, 3*4(%esp)
+
+	/*
+	 * When we're here from kernel mode, the (exception) stack looks like:
+	 *
+	 *  5*4(%esp) - <previous context>
+	 *  4*4(%esp) - flags
+	 *  3*4(%esp) - cs
+	 *  2*4(%esp) - ip
+	 *  1*4(%esp) - orig_eax
+	 *  0*4(%esp) - gs / function
+	 *
+	 * Let's build a 5-entry IRET frame after that, such that struct pt_regs
+	 * is complete and in particular regs->sp is correct. This gives us
+	 * the original 5 entries as gap:
+	 *
+	 * 12*4(%esp) - <previous context>
+	 * 11*4(%esp) - gap / flags
+	 * 10*4(%esp) - gap / cs
+	 *  9*4(%esp) - gap / ip
+	 *  8*4(%esp) - gap / orig_eax
+	 *  7*4(%esp) - gap / gs / function
+	 *  6*4(%esp) - ss
+	 *  5*4(%esp) - sp
+	 *  4*4(%esp) - flags
+	 *  3*4(%esp) - cs
+	 *  2*4(%esp) - ip
+	 *  1*4(%esp) - orig_eax
+	 *  0*4(%esp) - gs / function
+	 */
+
+	pushl	%ss		# ss
+	pushl	%esp		# sp (points at ss)
+	addl	$6*4, (%esp)	# point sp back at the previous context
+	pushl	6*4(%esp)	# flags
+	pushl	6*4(%esp)	# cs
+	pushl	6*4(%esp)	# ip
+	pushl	6*4(%esp)	# orig_eax
+	pushl	6*4(%esp)	# gs / function
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+	testl	$CS_FROM_KERNEL, 1*4(%esp)
+	jz	.Lfinished_frame_\@
+
+	/*
+	 * Reconstruct the 3-entry IRET frame right after the (modified)
+	 * regs->sp without lowering %esp in between, such that an NMI in the
+	 * middle doesn't scribble our stack.
+	 */
+	pushl	%eax
+	pushl	%ecx
+	movl	5*4(%esp), %eax		# (modified) regs->sp

+	movl	4*4(%esp), %ecx		# flags
+	movl	%ecx, -4(%eax)

+	movl	3*4(%esp), %ecx		# cs
+	andl	$0x0000ffff, %ecx
+	movl	%ecx, -8(%eax)

+	movl	2*4(%esp), %ecx		# ip
+	movl	%ecx, -12(%eax)

+	movl	1*4(%esp), %ecx		# eax
+	movl	%ecx, -16(%eax)

+	popl	%ecx
+	lea	-16(%eax), %esp
+	popl	%eax
+.Lfinished_frame_\@:
+.endm

 .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
 	cld
 	PUSH_GS
+	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
 	pushl	%ds
@@ -358,9 +451,6 @@
  * switch to it before we do any copying.
  */

-#define CS_FROM_ENTRY_STACK	(1 << 31)
-#define CS_FROM_USER_CR3	(1 << 30)
-
 .macro SWITCH_TO_KERNEL_STACK

 	ALTERNATIVE	"", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -374,13 +464,6 @@
  * that register for the time this macro runs
  */

-	/*
-	 * The high bits of the CS dword (__csh) are used for
-	 * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
-	 * hardware didn't do this for us.
-	 */
-	andl	$(0x0000ffff), PT_CS(%esp)
-
	/* Are we on the entry stack? Bail out if not! */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -990,6 +1073,7 @@ restore_all:
 	/* Restore user state */
 	RESTORE_REGS pop=4			# skip orig_eax/error_code
 .Lirq_return:
+	IRET_FRAME
 	/*
 	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
 	 * when returning from IPI handler and when returning from
@@ -1340,6 +1424,7 @@ END(page_fault)

 common_exception:
 	/* the function address is in %gs's slot on the stack */
+	FIXUP_FRAME
 	pushl	%fs
 	pushl	%es
 	pushl	%ds
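
With FIXUP_FRAME building the full pt_regs frame and IRET_FRAME tearing it back down before iret, kernel-mode pt_regs now carry a valid sp/ss pair. A minimal sketch (not part of the commit) of what consumers can now do uniformly:

#include <asm/ptrace.h>

/* Sketch: after this commit, reading the interrupted context's stack
 * pointer on x86_32 needs no user_mode() special case. */
static unsigned long interrupted_sp(struct pt_regs *regs)
{
	return regs->sp;	/* valid for user and kernel frames alike */
}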

arch/x86/include/asm/kexec.h

Lines changed: 0 additions & 17 deletions

@@ -70,22 +70,6 @@ struct kimage;
 #define KEXEC_BACKUP_SRC_START	(0UL)
 #define KEXEC_BACKUP_SRC_END	(640 * 1024UL - 1)	/* 640K */

-/*
- * CPU does not save ss and sp on stack if execution is already
- * running in kernel mode at the time of NMI occurrence. This code
- * fixes it.
- */
-static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
-				      struct pt_regs *oldregs)
-{
-#ifdef CONFIG_X86_32
-	newregs->sp = (unsigned long)&(oldregs->sp);
-	asm volatile("xorl %%eax, %%eax\n\t"
-		     "movw %%ss, %%ax\n\t"
-		     :"=a"(newregs->ss));
-#endif
-}
-
 /*
  * This function is responsible for capturing register states if coming
  * via panic otherwise just fix up the ss and sp if coming via kernel
@@ -96,7 +80,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 {
 	if (oldregs) {
 		memcpy(newregs, oldregs, sizeof(*newregs));
-		crash_fixup_ss_esp(newregs, oldregs);
 	} else {
 #ifdef CONFIG_X86_32
 		asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));

arch/x86/include/asm/ptrace.h

Lines changed: 2 additions & 15 deletions

@@ -166,14 +166,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 #define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif

-#ifdef CONFIG_X86_32
-extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
-#else
 static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
 {
 	return regs->sp;
 }
-#endif

 #define GET_IP(regs) ((regs)->ip)
 #define GET_FP(regs) ((regs)->bp)
@@ -201,14 +197,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
 	if (unlikely(offset > MAX_REG_OFFSET))
 		return 0;
 #ifdef CONFIG_X86_32
-	/*
-	 * Traps from the kernel do not save sp and ss.
-	 * Use the helper function to retrieve sp.
-	 */
-	if (offset == offsetof(struct pt_regs, sp) &&
-	    regs->cs == __KERNEL_CS)
-		return kernel_stack_pointer(regs);
-
 	/* The selector fields are 16-bit. */
 	if (offset == offsetof(struct pt_regs, cs) ||
 	    offset == offsetof(struct pt_regs, ss) ||
@@ -234,8 +222,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
 static inline int regs_within_kernel_stack(struct pt_regs *regs,
 					   unsigned long addr)
 {
-	return ((addr & ~(THREAD_SIZE - 1)) ==
-		(kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+	return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1)));
 }

 /**
@@ -249,7 +236,7 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs,
  */
 static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
 {
-	unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+	unsigned long *addr = (unsigned long *)regs->sp;

 	addr += n;
 	if (regs_within_kernel_stack(regs, (unsigned long)addr))
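
With kernel_stack_pointer() now just regs->sp on both 32- and 64-bit, helpers built on it work unchanged. A hedged usage sketch (the function name dump_top_of_stack is illustrative) using the existing regs_get_kernel_stack_nth() accessor:

#include <linux/kernel.h>
#include <linux/ptrace.h>

/* Sketch: fetch the first two words of the interrupted kernel stack,
 * located purely via regs->sp after this commit. */
static void dump_top_of_stack(struct pt_regs *regs)
{
	unsigned long w0 = regs_get_kernel_stack_nth(regs, 0);
	unsigned long w1 = regs_get_kernel_stack_nth(regs, 1);

	pr_info("stack[0]=%lx stack[1]=%lx\n", w0, w1);
}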

arch/x86/include/asm/stacktrace.h

Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@ static inline unsigned long *
 get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
 {
 	if (regs)
-		return (unsigned long *)kernel_stack_pointer(regs);
+		return (unsigned long *)regs->sp;

 	if (task == current)
 		return __builtin_frame_address(0);

arch/x86/kernel/crash.c

Lines changed: 0 additions & 8 deletions

@@ -73,14 +73,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)

 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 {
-#ifdef CONFIG_X86_32
-	struct pt_regs fixed_regs;
-
-	if (!user_mode(regs)) {
-		crash_fixup_ss_esp(&fixed_regs, regs);
-		regs = &fixed_regs;
-	}
-#endif
 	crash_save_cpu(regs, cpu);

 	/*

arch/x86/kernel/ftrace_32.S

Lines changed: 42 additions & 33 deletions

@@ -10,6 +10,7 @@
 #include <asm/ftrace.h>
 #include <asm/nospec-branch.h>
 #include <asm/frame.h>
+#include <asm/asm-offsets.h>

 # define function_hook	__fentry__
 EXPORT_SYMBOL(__fentry__)
@@ -90,26 +91,38 @@ END(ftrace_caller)

 ENTRY(ftrace_regs_caller)
 	/*
-	 * i386 does not save SS and ESP when coming from kernel.
-	 * Instead, to get sp, &regs->sp is used (see ptrace.h).
-	 * Unfortunately, that means eflags must be at the same location
-	 * as the current return ip is. We move the return ip into the
-	 * regs->ip location, and move flags into the return ip location.
+	 * We're here from an mcount/fentry CALL, and the stack frame looks like:
+	 *
+	 *  <previous context>
+	 *  RET-IP
+	 *
+	 * The purpose of this function is to call out in an emulated INT3
+	 * environment with a stack frame like:
+	 *
+	 *  <previous context>
+	 *  gap / RET-IP
+	 *  gap
+	 *  gap
+	 *  gap
+	 *  pt_regs
+	 *
+	 * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds
 	 */
-	pushl	$__KERNEL_CS
-	pushl	4(%esp)				/* Save the return ip */
-	pushl	$0				/* Load 0 into orig_ax */
+	subl	$3*4, %esp	# RET-IP + 3 gaps
+	pushl	%ss		# ss
+	pushl	%esp		# points at ss
+	addl	$5*4, (%esp)	#   make it point at <previous context>
+	pushfl			# flags
+	pushl	$__KERNEL_CS	# cs
+	pushl	7*4(%esp)	# ip <- RET-IP
+	pushl	$0		# orig_eax
+
 	pushl	%gs
 	pushl	%fs
 	pushl	%es
 	pushl	%ds
-	pushl	%eax
-
-	/* Get flags and place them into the return ip slot */
-	pushf
-	popl	%eax
-	movl	%eax, 8*4(%esp)

+	pushl	%eax
 	pushl	%ebp
 	pushl	%edi
 	pushl	%esi
@@ -119,39 +132,35 @@ ENTRY(ftrace_regs_caller)

 	ENCODE_FRAME_POINTER

-	movl	12*4(%esp), %eax		/* Load ip (1st parameter) */
-	subl	$MCOUNT_INSN_SIZE, %eax		/* Adjust ip */
-	movl	15*4(%esp), %edx		/* Load parent ip (2nd parameter) */
-	movl	function_trace_op, %ecx		/* Save ftrace_pos in 3rd parameter */
-	pushl	%esp				/* Save pt_regs as 4th parameter */
+	movl	PT_EIP(%esp), %eax	# 1st argument: IP
+	subl	$MCOUNT_INSN_SIZE, %eax
+	movl	21*4(%esp), %edx	# 2nd argument: parent ip
+	movl	function_trace_op, %ecx	# 3rd argument: ftrace_pos
+	pushl	%esp			# 4th argument: pt_regs

 GLOBAL(ftrace_regs_call)
 	call	ftrace_stub

-	addl	$4, %esp			/* Skip pt_regs */
+	addl	$4, %esp		# skip 4th argument

-	/* restore flags */
-	push	14*4(%esp)
-	popf
+	/* place IP below the new SP */
+	movl	PT_OLDESP(%esp), %eax
+	movl	PT_EIP(%esp), %ecx
+	movl	%ecx, -4(%eax)

-	/* Move return ip back to its original location */
-	movl	12*4(%esp), %eax
-	movl	%eax, 14*4(%esp)
+	/* place EAX below that */
+	movl	PT_EAX(%esp), %ecx
+	movl	%ecx, -8(%eax)

 	popl	%ebx
 	popl	%ecx
 	popl	%edx
 	popl	%esi
 	popl	%edi
 	popl	%ebp
-	popl	%eax
-	popl	%ds
-	popl	%es
-	popl	%fs
-	popl	%gs

-	/* use lea to not affect flags */
-	lea	3*4(%esp), %esp			/* Skip orig_ax, ip and cs */
+	lea	-8(%eax), %esp
+	popl	%eax

 	jmp	.Lftrace_ret

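
ftrace_regs_caller constructs the pt_regs handed to callbacks registered with FTRACE_OPS_FL_SAVE_REGS. A hedged sketch of such a callback (my_tracer_func and my_ops are illustrative names) that can now trust regs->sp on x86_32; it would be registered via register_ftrace_function(&my_ops):

#include <linux/ftrace.h>
#include <linux/kernel.h>

/* Sketch: a SAVE_REGS ftrace callback; with this commit, the frame
 * built by ftrace_regs_caller includes valid sp/ss on x86_32. */
static void my_tracer_func(unsigned long ip, unsigned long parent_ip,
			   struct ftrace_ops *op, struct pt_regs *regs)
{
	pr_info("traced %pS, sp=%lx\n", (void *)ip, regs->sp);
}

static struct ftrace_ops my_ops = {
	.func	= my_tracer_func,
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
};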
arch/x86/kernel/kgdb.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,6 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
118118

119119
#ifdef CONFIG_X86_32
120120
switch (regno) {
121-
case GDB_SS:
122-
if (!user_mode(regs))
123-
*(unsigned long *)mem = __KERNEL_DS;
124-
break;
125-
case GDB_SP:
126-
if (!user_mode(regs))
127-
*(unsigned long *)mem = kernel_stack_pointer(regs);
128-
break;
129121
case GDB_GS:
130122
case GDB_FS:
131123
*(unsigned long *)mem = 0xFFFF;

arch/x86/kernel/kprobes/common.h

Lines changed: 2 additions & 2 deletions

@@ -72,8 +72,8 @@
 		"	popl %edi\n"	\
 		"	popl %ebp\n"	\
 		"	popl %eax\n"	\
-		/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
-		"	addl $24, %esp\n"
+		/* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\
+		"	addl $7*4, %esp\n"
 #endif

 /* Ensure if the instruction can be boostable */
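
The new "addl $7*4, %esp" skips exactly the seven remaining 4-byte pt_regs slots (ds, es, fs, gs, orig_ax, ip, cs). A hedged compile-time check of that arithmetic against the x86_32 struct layout (illustrative, not part of the commit):

#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <asm/ptrace.h>

/* Sketch (x86_32 only): everything between ds and flags in struct
 * pt_regs is the seven slots the trampoline's "addl $7*4, %esp"
 * steps over. */
static inline void check_trampoline_skip(void)
{
	BUILD_BUG_ON(offsetof(struct pt_regs, flags) -
		     offsetof(struct pt_regs, ds) != 7 * 4);
}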
