Commit 6650cdd

Peter Zijlstra (Intel) authored and suryasaimadhu committed
x86/split_lock: Enable split lock detection by kernel
A split-lock occurs when an atomic instruction operates on data that
spans two cache lines. In order to maintain atomicity the core takes a
global bus lock.

This is typically >1000 cycles slower than an atomic operation within a
cache line. It also disrupts performance on other cores (which must wait
for the bus lock to be released before their memory operations can
complete). For real-time systems this may mean missing deadlines. For
other systems it may just be very annoying.

Some CPUs have the capability to raise an #AC trap when a split lock is
attempted.

Provide a command line option to give the user choices on how to handle
this:

	split_lock_detect=
		off	- not enabled (no traps for split locks)
		warn	- warn once when an application does a
			  split lock, but allow it to continue
			  running.
		fatal	- send SIGBUS to applications that cause
			  split lock

On systems that support split lock detection the default is "warn".
Note that if the kernel hits a split lock in any mode other than "off"
it will oops.

One implementation wrinkle is that the MSR to control the split lock
detection is per-core, not per thread. This might result in some
short-lived races on HT systems in "warn" mode if Linux tries to enable
on one thread while disabling on the other. Race analysis by Sean
Christopherson:

  - Toggling of split-lock is only done in "warn" mode. Worst case
    scenario of a race is that a misbehaving task will generate multiple
    #AC exceptions on the same instruction. And this race will only
    occur if both siblings are running tasks that generate split-lock
    #ACs, e.g. a race where sibling threads are writing different values
    will only occur if CPUx is disabling split-lock after an #AC and
    CPUy is re-enabling split-lock after *its* previous task generated
    an #AC.

  - Transitioning between off/warn/fatal modes at runtime isn't
    supported and disabling is tracked per task, so hardware will always
    reach a steady state that matches the configured mode. I.e.
    split-lock is guaranteed to be enabled in hardware once all
    _TIF_SLD threads have been scheduled out.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Co-developed-by: Fenghua Yu <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
Co-developed-by: Tony Luck <[email protected]>
Signed-off-by: Tony Luck <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
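For a concrete picture of what the new #AC path catches, here is a
minimal user-space sketch (not part of this commit; the file name and
buffer layout are illustrative) that performs a LOCK-prefixed atomic
operation straddling a 64-byte cache-line boundary. Under
split_lock_detect=warn it should trigger the rate-limited warning;
under split_lock_detect=fatal it should die with SIGBUS.

/* split_test.c -- illustrative split-lock trigger; build with gcc -O2 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Two cache lines, aligned so the 64-byte boundary is known. */
	static uint8_t buf[128] __attribute__((aligned(64)));

	/* A 4-byte target spanning bytes 62..65 crosses the boundary. */
	uint32_t *split = (uint32_t *)(buf + 62);

	/* Emits a LOCK-prefixed RMW on x86: a split lock on this data. */
	__atomic_fetch_add(split, 1, __ATOMIC_SEQ_CST);

	printf("done, value = %u\n", *split);
	return 0;
}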
1 parent 11a48a5 commit 6650cdd

File tree

9 files changed: 250 additions & 3 deletions

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 22 additions & 0 deletions
@@ -4655,6 +4655,28 @@
 	spia_pedr=
 	spia_peddr=
 
+	split_lock_detect=
+			[X86] Enable split lock detection
+
+			When enabled (and if hardware support is present), atomic
+			instructions that access data across cache line
+			boundaries will result in an alignment check exception.
+
+			off	- not enabled
+
+			warn	- the kernel will emit rate limited warnings
+				  about applications triggering the #AC
+				  exception. This mode is the default on CPUs
+				  that support split lock detection.
+
+			fatal	- the kernel will send SIGBUS to applications
+				  that trigger the #AC exception.
+
+			If an #AC exception is hit in the kernel or in
+			firmware (i.e. not while executing in user mode)
+			the kernel will oops in either "warn" or "fatal"
+			mode.
+
 	srcutree.counter_wrap_check [KNL]
 			Specifies how frequently to check for
 			grace-period sequence counter wrap for the
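A usage sketch for the new parameter (the bootloader entry below is a
placeholder, not taken from this commit): it is appended to the kernel
command line like any other boot option, e.g.

	linux /boot/vmlinuz root=/dev/sda1 ro quiet split_lock_detect=fatal

The mode actually selected at boot is reported by the pr_info() calls
added to arch/x86/kernel/cpu/intel.c further down.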

arch/x86/include/asm/cpu.h

Lines changed: 12 additions & 0 deletions
@@ -40,4 +40,16 @@ int mwait_usable(const struct cpuinfo_x86 *);
 unsigned int x86_family(unsigned int sig);
 unsigned int x86_model(unsigned int sig);
 unsigned int x86_stepping(unsigned int sig);
+#ifdef CONFIG_CPU_SUP_INTEL
+extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
+extern void switch_to_sld(unsigned long tifn);
+extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
+#else
+static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
+static inline void switch_to_sld(unsigned long tifn) {}
+static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+	return false;
+}
+#endif
 #endif /* _ASM_X86_CPU_H */

arch/x86/include/asm/cpufeatures.h

Lines changed: 2 additions & 0 deletions
@@ -285,6 +285,7 @@
 #define X86_FEATURE_CQM_MBM_LOCAL	(11*32+ 3) /* LLC Local MBM monitoring */
 #define X86_FEATURE_FENCE_SWAPGS_USER	(11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL	(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
+#define X86_FEATURE_SPLIT_LOCK_DETECT	(11*32+ 6) /* #AC for split lock */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
@@ -367,6 +368,7 @@
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D		(18*32+28) /* Flush L1D cache */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_CORE_CAPABILITIES	(18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
 #define X86_FEATURE_SPEC_CTRL_SSBD	(18*32+31) /* "" Speculative Store Bypass Disable */
 
 /*

arch/x86/include/asm/msr-index.h

Lines changed: 9 additions & 0 deletions
@@ -41,6 +41,10 @@
 
 /* Intel MSRs. Some also available on other CPUs */
 
+#define MSR_TEST_CTRL				0x00000033
+#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT	29
+#define MSR_TEST_CTRL_SPLIT_LOCK_DETECT		BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT)
+
 #define MSR_IA32_SPEC_CTRL		0x00000048 /* Speculation Control */
 #define SPEC_CTRL_IBRS			BIT(0)	   /* Indirect Branch Restricted Speculation */
 #define SPEC_CTRL_STIBP_SHIFT		1	   /* Single Thread Indirect Branch Predictor (STIBP) bit */
@@ -70,6 +74,11 @@
  */
 #define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)
 
+/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
+#define MSR_IA32_CORE_CAPS			  0x000000cf
+#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT  5
+#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT	  BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT)
+
 #define MSR_PKG_CST_CONFIG_CONTROL	0x000000e2
 #define NHM_C3_AUTO_DEMOTE		(1UL << 25)
 #define NHM_C1_AUTO_DEMOTE		(1UL << 26)
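As a hedged aside, enumeration can also be checked from user space by
reading the IA32_CORE_CAPABILITIES MSR (0xcf) through the msr driver;
the sketch below assumes the msr module is loaded ("modprobe msr") and
root privileges, and the file name is illustrative:

/* core_caps.c -- read MSR 0xcf on CPU 0 and test bit 5 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	/* pread() on the msr device reads 8 bytes at offset == MSR index. */
	if (fd < 0 || pread(fd, &val, sizeof(val), 0xcf) != sizeof(val)) {
		perror("IA32_CORE_CAPABILITIES not readable");
		return 1;
	}
	printf("split lock detect enumerated: %s\n",
	       (val & (1ULL << 5)) ? "yes" : "no");
	close(fd);
	return 0;
}

Note this only covers CPUs that expose the MSR; the Icelake parts listed
in intel.c below are matched by family/model instead.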

arch/x86/include/asm/thread_info.h

Lines changed: 3 additions & 1 deletion
@@ -92,6 +92,7 @@ struct thread_info {
 #define TIF_NOCPUID		15	/* CPUID is not accessible in userland */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* IA32 compatibility process */
+#define TIF_SLD			18	/* Restore split lock detection on context switch */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG	21	/* idle is polling for TIF_NEED_RESCHED */
@@ -122,6 +123,7 @@ struct thread_info {
 #define _TIF_NOCPUID		(1 << TIF_NOCPUID)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
+#define _TIF_SLD		(1 << TIF_SLD)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
@@ -145,7 +147,7 @@ struct thread_info {
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW_BASE					\
 	(_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP |		\
-	 _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+	 _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD)
 
 /*
  * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.

arch/x86/kernel/cpu/common.c

Lines changed: 2 additions & 0 deletions
@@ -1224,6 +1224,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 	cpu_set_bug_bits(c);
 
+	cpu_set_core_cap_bits(c);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

arch/x86/kernel/cpu/intel.c

Lines changed: 175 additions & 0 deletions
@@ -19,6 +19,8 @@
 #include <asm/microcode_intel.h>
 #include <asm/hwcap2.h>
 #include <asm/elf.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
@@ -31,6 +33,19 @@
 #include <asm/apic.h>
 #endif
 
+enum split_lock_detect_state {
+	sld_off = 0,
+	sld_warn,
+	sld_fatal,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock
+ * detection. split_lock_setup() will switch this to sld_warn on systems
+ * that support split lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state = sld_off;
+
 /*
  * Processors which have self-snooping capability can handle conflicting
  * memory type across CPUs by snooping its own cache. However, there exists
@@ -570,6 +585,8 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 	wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
 }
 
+static void split_lock_init(void);
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	early_init_intel(c);
@@ -684,6 +701,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		tsx_enable();
 	if (tsx_ctrl_state == TSX_CTRL_DISABLE)
 		tsx_disable();
+
+	split_lock_init();
 }
 
 #ifdef CONFIG_X86_32
@@ -945,3 +964,159 @@ static const struct cpu_dev intel_cpu_dev = {
 };
 
 cpu_dev_register(intel_cpu_dev);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+static const struct {
+	const char			*option;
+	enum split_lock_detect_state	state;
+} sld_options[] __initconst = {
+	{ "off",	sld_off   },
+	{ "warn",	sld_warn  },
+	{ "fatal",	sld_fatal },
+};
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+	int len = strlen(opt);
+
+	return len == arglen && !strncmp(arg, opt, len);
+}
+
+static void __init split_lock_setup(void)
+{
+	char arg[20];
+	int i, ret;
+
+	setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+	sld_state = sld_warn;
+
+	ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+				  arg, sizeof(arg));
+	if (ret >= 0) {
+		for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+			if (match_option(arg, ret, sld_options[i].option)) {
+				sld_state = sld_options[i].state;
+				break;
+			}
+		}
+	}
+
+	switch (sld_state) {
+	case sld_off:
+		pr_info("disabled\n");
+		break;
+
+	case sld_warn:
+		pr_info("warning about user-space split_locks\n");
+		break;
+
+	case sld_fatal:
+		pr_info("sending SIGBUS on user-space split_locks\n");
+		break;
+	}
+}
+
+/*
+ * Locking is not required at the moment because only bit 29 of this
+ * MSR is implemented and locking would not prevent that the operation
+ * of one thread is immediately undone by the sibling thread.
+ * Use the "safe" versions of rdmsr/wrmsr here because although code
+ * checks CPUID and MSR bits to make sure the TEST_CTRL MSR should
+ * exist, there may be glitches in virtualization that leave a guest
+ * with an incorrect view of real h/w capabilities.
+ */
+static bool __sld_msr_set(bool on)
+{
+	u64 test_ctrl_val;
+
+	if (rdmsrl_safe(MSR_TEST_CTRL, &test_ctrl_val))
+		return false;
+
+	if (on)
+		test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+	else
+		test_ctrl_val &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+	return !wrmsrl_safe(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+static void split_lock_init(void)
+{
+	if (sld_state == sld_off)
+		return;
+
+	if (__sld_msr_set(true))
+		return;
+
+	/*
+	 * If this is anything other than the boot-cpu, you've done
+	 * funny things and you get to keep whatever pieces.
+	 */
+	pr_warn("MSR fail -- disabled\n");
+	sld_state = sld_off;
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+	if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+		return false;
+
+	pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+			    current->comm, current->pid, regs->ip);
+
+	/*
+	 * Disable the split lock detection for this task so it can make
+	 * progress and set TIF_SLD so the detection is re-enabled via
+	 * switch_to_sld() when the task is scheduled out.
+	 */
+	__sld_msr_set(false);
+	set_tsk_thread_flag(current, TIF_SLD);
+	return true;
+}
+
+/*
+ * This function is called only when switching between tasks with
+ * different split-lock detection modes. It sets the MSR for the
+ * mode of the new task. This is right most of the time, but since
+ * the MSR is shared by hyperthreads on a physical core there can
+ * be glitches when the two threads need different modes.
+ */
+void switch_to_sld(unsigned long tifn)
+{
+	__sld_msr_set(!(tifn & _TIF_SLD));
+}
+
+#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY}
+
+/*
+ * The following processors have the split lock detection feature. But
+ * since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot
+ * be enumerated. Enable it by family and model matching on these
+ * processors.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+	SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X),
+	SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L),
+	{}
+};
+
+void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+{
+	u64 ia32_core_caps = 0;
+
+	if (c->x86_vendor != X86_VENDOR_INTEL)
+		return;
+	if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) {
+		/* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */
+		rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+	} else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		/* Enumerate split lock detection by family and model. */
+		if (x86_match_cpu(split_lock_cpu_ids))
+			ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
+	}
+
+	if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+		split_lock_setup();
+}
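In "warn" mode the pr_warn_ratelimited() above, combined with the
pr_fmt prefix defined at the top of this hunk, yields console output
of roughly this shape (timestamp, task name, PID and address are
made-up values for illustration):

	[  613.914074] x86/split lock detection: #AC: sl_test/2773 took a split_lock trap at address: 0x401225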

arch/x86/kernel/process.c

Lines changed: 3 additions & 0 deletions
@@ -650,6 +650,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 		/* Enforce MSR update to ensure consistent state */
 		__speculation_ctrl_update(~tifn, tifn);
 	}
+
+	if ((tifp ^ tifn) & _TIF_SLD)
+		switch_to_sld(tifn);
 }
 
 /*
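A minimal stand-alone sketch of the XOR test used above (ordinary C,
not kernel code; bit 18 mirrors TIF_SLD from thread_info.h): the MSR
write only happens when the flag differs between the outgoing and
incoming task.

#include <stdio.h>

#define TIF_SLD  18
#define _TIF_SLD (1UL << TIF_SLD)

int main(void)
{
	unsigned long tifp = _TIF_SLD;	/* prev task: detection disabled after an #AC */
	unsigned long tifn = 0;		/* next task: detection should be on */

	if ((tifp ^ tifn) & _TIF_SLD)	/* flag differs => MSR must be updated */
		printf("switch_to_sld(): enable = %d\n", !(tifn & _TIF_SLD));

	return 0;
}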

arch/x86/kernel/traps.c

Lines changed: 22 additions & 2 deletions
@@ -46,6 +46,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/cpu.h>
 #include <asm/cpu_entry_area.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
@@ -242,7 +243,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 {
 	struct task_struct *tsk = current;
 
-
 	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
 		return;
 
@@ -288,9 +288,29 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru
 DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
 DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
 DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
-DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check)
 #undef IP
 
+dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
+{
+	char *str = "alignment check";
+
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
+		return;
+
+	if (!user_mode(regs))
+		die("Split lock detected\n", regs, error_code);
+
+	local_irq_enable();
+
+	if (handle_user_split_lock(regs, error_code))
+		return;
+
+	do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
+		error_code, BUS_ADRALN, NULL);
+}
+
 #ifdef CONFIG_VMAP_STACK
 __visible void __noreturn handle_stack_overflow(const char *message,
 						struct pt_regs *regs,
