@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -30,6 +29,12 @@
  * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlbstate.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB	0x1UL
+
 /*
  * We get here when we do something requiring a TLB invalidation
  * but could not go invalidate all of the contexts. We do the
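The define above packs a thread flag into the low bit of a pointer. As a rough standalone illustration (not part of the patch; struct mm_demo, mangle() and DEMO_IBPB_BIT are hypothetical stand-ins for mm_struct, mm_mangle_tif_spec_ib() and LAST_USER_MM_IBPB), the trick only assumes that the allocator returns pointers aligned to more than one byte, so bit 0 of a valid pointer is always zero and is free to carry a flag:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_IBPB_BIT	0x1UL	/* plays the role of LAST_USER_MM_IBPB */

struct mm_demo { int dummy; };	/* hypothetical stand-in for mm_struct */

/* Fold a one-bit flag into the low bit of an aligned pointer. */
static unsigned long mangle(struct mm_demo *mm, int flag)
{
	return (unsigned long)mm | (flag ? DEMO_IBPB_BIT : 0);
}

int main(void)
{
	struct mm_demo *mm = malloc(sizeof(*mm));

	/* malloc() returns suitably aligned memory, so bit 0 is clear. */
	assert(((unsigned long)mm & DEMO_IBPB_BIT) == 0);

	unsigned long tagged = mangle(mm, 1);

	/* Both the flag and the original pointer are recoverable. */
	printf("flag=%lu ptr_ok=%d\n", tagged & DEMO_IBPB_BIT,
	       (struct mm_demo *)(tagged & ~DEMO_IBPB_BIT) == mm);

	free(mm);
	return 0;
}

In the patch the stored value is never unmangled; the mangled words are simply compared, so pointer identity and flag state are checked in a single comparison.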
@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+	unsigned long next_tif = task_thread_info(next)->flags;
+	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+	return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
 {
+	if (!next || !next->mm)
+		return;
+
 	/*
-	 * Check if the current (previous) task has access to the memory
-	 * of the @tsk (next) task. If access is denied, make sure to
-	 * issue a IBPB to stop user->user Spectre-v2 attacks.
-	 *
-	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
+	 * Both the conditional and the always IBPB mode use the mm
+	 * pointer to avoid the IBPB when switching between tasks of the
+	 * same process. Using the mm pointer instead of mm->context.ctx_id
+	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
+	 * less impossible to control by an attacker. Aside from that, it
+	 * would only affect the first schedule so the theoretically
+	 * exposed data is not really interesting.
 	 */
-	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+	if (static_branch_likely(&switch_mm_cond_ibpb)) {
+		unsigned long prev_mm, next_mm;
+
+		/*
+		 * This is a bit more complex than the always mode because
+		 * it has to handle two cases:
+		 *
+		 * 1) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB not set.
+		 *
+		 * 2) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB not set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB set.
+		 *
+		 * This could be done by unconditionally issuing IBPB when
+		 * a task which has TIF_SPEC_IB set is either scheduled in
+		 * or out. Though that results in two flushes when:
+		 *
+		 * - the same user space task is scheduled out and later
+		 *   scheduled in again and only a kernel thread ran in
+		 *   between.
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in after a kernel thread ran in between
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in immediately.
+		 *
+		 * Optimize this with reasonably small overhead for the
+		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+		 * pointer of the incoming task which is stored in
+		 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+		 */
+		next_mm = mm_mangle_tif_spec_ib(next);
+		prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+		/*
+		 * Issue IBPB only if the mm's are different and one or
+		 * both have the IBPB bit set.
+		 */
+		if (next_mm != prev_mm &&
+		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+			indirect_branch_prediction_barrier();
+
+		this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+	}
+
+	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+		/*
+		 * Only flush when switching to a user space task with a
+		 * different context than the user space task which ran
+		 * last on this CPU.
+		 */
+		if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+			indirect_branch_prediction_barrier();
+			this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+		}
+	}
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
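For clarity, here is a standalone sketch of just the predicate used in the conditional branch above, with made-up pointer values (the helper needs_ibpb() and the sample addresses are illustrative, not kernel code); it walks the attacker/victim cases described in the comment:

#include <stdio.h>

#define LAST_USER_MM_IBPB 0x1UL

/* Mirrors the comparison in the conditional branch of cond_ibpb(). */
static int needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
	return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
	/* Made-up "mangled mm" values; real ones come from mm_mangle_tif_spec_ib(). */
	unsigned long a = 0x1000, a_ib = 0x1000 | LAST_USER_MM_IBPB;
	unsigned long b = 0x2000, b_ib = 0x2000 | LAST_USER_MM_IBPB;

	printf("%d\n", needs_ibpb(a_ib, b));    /* 1: task with TIF_SPEC_IB -> task without */
	printf("%d\n", needs_ibpb(a, b_ib));    /* 1: task without TIF_SPEC_IB -> task with */
	printf("%d\n", needs_ibpb(a, b));       /* 0: neither side asked for IBPB           */
	printf("%d\n", needs_ibpb(a_ib, a_ib)); /* 0: same process, no redundant flush      */
	return 0;
}

Same-mm switches and switches where neither side has TIF_SPEC_IB set skip the barrier, which is exactly the optimization the in-code comment describes.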
@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		new_asid = prev_asid;
 		need_flush = true;
 	} else {
-		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
 		/*
 		 * Avoid user/user BTB poisoning by flushing the branch
 		 * predictor when switching between processes. This stops
 		 * one process from doing Spectre-v2 attacks on another.
-		 *
-		 * As an optimization, flush indirect branches only when
-		 * switching into a processes that can't be ptrace by the
-		 * current one (as in such case, attacker has much more
-		 * convenient way how to tamper with the next process than
-		 * branch buffer poisoning).
 		 */
-		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-		    ibpb_needed(tsk, last_ctx_id))
-			indirect_branch_prediction_barrier();
+		cond_ibpb(tsk);
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
-	/*
-	 * Record last user mm's context id, so we can avoid
-	 * flushing branch buffer with IBPB if we switch back
-	 * to the same user.
-	 */
-	if (next != &init_mm)
-		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
 	/* Make sure we write CR3 before loaded_mm. */
 	barrier();
 
@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
 	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
-	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
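One property of this reinitialization seems worth spelling out (a reading of the code, not something stated in the diff itself): a mangled mm value is always a real, aligned pointer, possibly with bit 0 set, so it can never equal the bare flag value 0x1, and because the stored sentinel has the IBPB bit set, the conditional mode issues a barrier on the first user-space switch after this reset. A tiny standalone check of that, reusing the illustrative needs_ibpb() predicate from the earlier sketch:

#include <assert.h>

#define LAST_USER_MM_IBPB 0x1UL

/* Mirrors the comparison in the conditional branch of cond_ibpb(). */
static int needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
	return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
	unsigned long reset = LAST_USER_MM_IBPB; /* value written by initialize_tlbstate_and_flush() */
	unsigned long first_user_mm = 0x1000;    /* made-up mangled mm pointer, IB bit clear */

	/* The sentinel differs from any real mm and carries the IBPB bit,
	 * so the first switch after reinitialization always flushes. */
	assert(needs_ibpb(reset, first_user_mm) == 1);
	return 0;
}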