Commit 4c71a2b

x86/speculation: Prepare for conditional IBPB in switch_mm()
The IBPB speculation barrier is issued from switch_mm() when the kernel
switches to a user space task with a different mm than the user space task
which ran last on the same CPU.

An additional optimization is to avoid IBPB when the incoming task can be
ptraced by the outgoing task. This optimization only works when switching
directly between two user space tasks. When switching from a kernel task to
a user space task the optimization fails because the previous task cannot
be accessed anymore. So in quite a few scenarios the optimization just adds
overhead.

The upcoming conditional IBPB support will issue IBPB only for user space
tasks which have the TIF_SPEC_IB bit set. This requires handling the
following cases:

  1) Switch from a user space task (potential attacker) which has
     TIF_SPEC_IB set to a user space task (potential victim) which has
     TIF_SPEC_IB not set.

  2) Switch from a user space task (potential attacker) which has
     TIF_SPEC_IB not set to a user space task (potential victim) which has
     TIF_SPEC_IB set.

This needs to be optimized for the case where IBPB can be avoided when only
kernel threads ran in between user space tasks which belong to the same
process.

The current check whether two tasks belong to the same context uses the
tasks' context id. While correct, it is simpler to use the mm pointer
because it allows mangling the TIF_SPEC_IB bit into it. The context id
based mechanism requires extra storage, which creates worse code.

When a task is scheduled out, its TIF_SPEC_IB bit is mangled as bit 0 into
the per-CPU storage which is used to track the last user space mm which ran
on a CPU. This bit can be used together with the TIF_SPEC_IB bit of the
incoming task to decide whether IBPB needs to be issued, covering the two
cases above.

As conditional IBPB is going to be the default, remove the dubious ptrace
check for the IBPB-always case and simply issue IBPB always when the
process changes.

Move the storage to a different place in the struct as the original one
created a hole.

Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Jiri Kosina <[email protected]>
Cc: Tom Lendacky <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: David Woodhouse <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Casey Schaufler <[email protected]>
Cc: Asit Mallick <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Jon Masters <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Greg KH <[email protected]>
Cc: Dave Stewart <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: [email protected]
Link: https://lkml.kernel.org/r/[email protected]
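To make the decision rule concrete, here is a minimal standalone sketch (not kernel code) of the mangled-pointer comparison described above. It assumes that bit 0 of an mm pointer is always clear due to alignment, so the TIF_SPEC_IB state can be folded into that bit; the helpers mangle() and ibpb_needed() and the pointer values in main() are illustrative only.

/*
 * Minimal standalone sketch of the IBPB decision described above.
 * Helper names and values are illustrative, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define LAST_USER_MM_IBPB 0x1UL

/* Fold the task's TIF_SPEC_IB state into bit 0 of its mm pointer value. */
static unsigned long mangle(unsigned long mm, bool tif_spec_ib)
{
        return mm | (tif_spec_ib ? LAST_USER_MM_IBPB : 0UL);
}

/* IBPB is needed when the mangled values differ and at least one has the bit set. */
static bool ibpb_needed(unsigned long prev_mm, unsigned long next_mm)
{
        return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
        unsigned long mm_a = 0x1000UL, mm_b = 0x2000UL;

        /* Case 1: attacker with TIF_SPEC_IB set -> victim in another process without it: prints 1 */
        printf("%d\n", ibpb_needed(mangle(mm_a, true), mangle(mm_b, false)));
        /* Case 2: attacker without TIF_SPEC_IB -> victim in another process with it set: prints 1 */
        printf("%d\n", ibpb_needed(mangle(mm_a, false), mangle(mm_b, true)));
        /* Same process, unchanged flag (e.g. only a kernel thread ran in between): prints 0 */
        printf("%d\n", ibpb_needed(mangle(mm_a, true), mangle(mm_a, true)));
        /* Different processes, neither side opted in: prints 0 */
        printf("%d\n", ibpb_needed(mangle(mm_a, false), mangle(mm_b, false)));
        return 0;
}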
1 parent 5635d99 commit 4c71a2b

4 files changed: +118 / -36 lines

arch/x86/include/asm/nospec-branch.h
Lines changed: 2 additions & 0 deletions

@@ -312,6 +312,8 @@ do { \
 } while (0)
 
 DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 #endif /* __ASSEMBLY__ */
arch/x86/include/asm/tlbflush.h
Lines changed: 6 additions & 2 deletions

@@ -169,10 +169,14 @@ struct tlb_state {
 
 #define LOADED_MM_SWITCHING ((struct mm_struct *)1)
 
+        /* Last user mm for optimizing IBPB */
+        union {
+                struct mm_struct        *last_user_mm;
+                unsigned long           last_user_mm_ibpb;
+        };
+
         u16 loaded_mm_asid;
         u16 next_asid;
-        /* last user mm's ctx id */
-        u64 last_ctx_id;
 
         /*
          * We can be in one of several states:
arch/x86/kernel/cpu/bugs.c
Lines changed: 24 additions & 5 deletions

@@ -56,6 +56,10 @@ u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
 
 /* Control conditional STIPB in switch_to() */
 DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 void __init check_bugs(void)
 {
@@ -331,7 +335,17 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
         /* Initialize Indirect Branch Prediction Barrier */
         if (boot_cpu_has(X86_FEATURE_IBPB)) {
                 setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
-                pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
+
+                switch (mode) {
+                case SPECTRE_V2_USER_STRICT:
+                        static_branch_enable(&switch_mm_always_ibpb);
+                        break;
+                default:
+                        break;
+                }
+
+                pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+                        mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional");
         }
 
         /* If enhanced IBRS is enabled no STIPB required */
@@ -955,10 +969,15 @@ static char *stibp_state(void)
 
 static char *ibpb_state(void)
 {
-        if (boot_cpu_has(X86_FEATURE_USE_IBPB))
-                return ", IBPB";
-        else
-                return "";
+        if (boot_cpu_has(X86_FEATURE_IBPB)) {
+                switch (spectre_v2_user) {
+                case SPECTRE_V2_USER_NONE:
+                        return ", IBPB: disabled";
+                case SPECTRE_V2_USER_STRICT:
+                        return ", IBPB: always-on";
+                }
+        }
+        return "";
 }
 
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,

arch/x86/mm/tlb.c
Lines changed: 86 additions & 29 deletions

@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -30,6 +29,12 @@
  *      Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB       0x1UL
+
 /*
  * We get here when we do something requiring a TLB invalidation
  * but could not go invalidate all of the contexts.  We do the
@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
         }
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+        unsigned long next_tif = task_thread_info(next)->flags;
+        unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+        return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
 {
+        if (!next || !next->mm)
+                return;
+
         /*
-         * Check if the current (previous) task has access to the memory
-         * of the @tsk (next) task. If access is denied, make sure to
-         * issue a IBPB to stop user->user Spectre-v2 attacks.
-         *
-         * Note: __ptrace_may_access() returns 0 or -ERRNO.
+         * Both, the conditional and the always IBPB mode use the mm
+         * pointer to avoid the IBPB when switching between tasks of the
+         * same process. Using the mm pointer instead of mm->context.ctx_id
+         * opens a hypothetical hole vs. mm_struct reuse, which is more or
+         * less impossible to control by an attacker. Aside of that it
+         * would only affect the first schedule so the theoretically
+         * exposed data is not really interesting.
          */
-        return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-                ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+        if (static_branch_likely(&switch_mm_cond_ibpb)) {
+                unsigned long prev_mm, next_mm;
+
+                /*
+                 * This is a bit more complex than the always mode because
+                 * it has to handle two cases:
+                 *
+                 *   1) Switch from a user space task (potential attacker)
+                 *      which has TIF_SPEC_IB set to a user space task
+                 *      (potential victim) which has TIF_SPEC_IB not set.
+                 *
+                 *   2) Switch from a user space task (potential attacker)
+                 *      which has TIF_SPEC_IB not set to a user space task
+                 *      (potential victim) which has TIF_SPEC_IB set.
+                 *
+                 * This could be done by unconditionally issuing IBPB when
+                 * a task which has TIF_SPEC_IB set is either scheduled in
+                 * or out. Though that results in two flushes when:
+                 *
+                 *  - the same user space task is scheduled out and later
+                 *    scheduled in again and only a kernel thread ran in
+                 *    between.
+                 *
+                 *  - a user space task belonging to the same process is
+                 *    scheduled in after a kernel thread ran in between
+                 *
+                 *  - a user space task belonging to the same process is
+                 *    scheduled in immediately.
+                 *
+                 * Optimize this with reasonably small overhead for the
+                 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+                 * pointer of the incoming task which is stored in
+                 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+                 */
+                next_mm = mm_mangle_tif_spec_ib(next);
+                prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+                /*
+                 * Issue IBPB only if the mm's are different and one or
+                 * both have the IBPB bit set.
+                 */
+                if (next_mm != prev_mm &&
+                    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+                        indirect_branch_prediction_barrier();
+
+                this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+        }
+
+        if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+                /*
+                 * Only flush when switching to a user space task with a
+                 * different context than the user space task which ran
+                 * last on this CPU.
+                 */
+                if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+                        indirect_branch_prediction_barrier();
+                        this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+                }
+        }
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 new_asid = prev_asid;
                 need_flush = true;
         } else {
-                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
                 /*
                  * Avoid user/user BTB poisoning by flushing the branch
                  * predictor when switching between processes. This stops
                  * one process from doing Spectre-v2 attacks on another.
-                 *
-                 * As an optimization, flush indirect branches only when
-                 * switching into a processes that can't be ptrace by the
-                 * current one (as in such case, attacker has much more
-                 * convenient way how to tamper with the next process than
-                 * branch buffer poisoning).
                  */
-                if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-                    ibpb_needed(tsk, last_ctx_id))
-                        indirect_branch_prediction_barrier();
+                cond_ibpb(tsk);
 
                 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                         /*
@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
         }
 
-        /*
-         * Record last user mm's context id, so we can avoid
-         * flushing branch buffer with IBPB if we switch back
-         * to the same user.
-         */
-        if (next != &init_mm)
-                this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
         /* Make sure we write CR3 before loaded_mm. */
         barrier();
 
@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
         write_cr3(build_cr3(mm->pgd, 0));
 
         /* Reinitialize tlbstate. */
-        this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
+        this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
         this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
         this_cpu_write(cpu_tlbstate.next_asid, 1);
         this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);