Skip to content

Commit a43de48

Browse files
committed
Merge branch 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull ras fixes from Thomas Gleixner: "A set of fixes for RAS/MCE: - Improve the error message when the kernel cannot recover from an MCE so the maximum amount of information gets provided. - Individually check MCE recovery features on Skylake CPUs instead of assuming none when the CAPID0 register does not advertise the general ability for recovery. - Prevent the MCE code from printing inconsistent messages which first show an error location and then claim that the source is unknown. - Prevent overwriting MCi_STATUS in the attempt to gather more information when a fatal MCE has already been detected. This leads to empty status values in the printout and a failure to react promptly to the fatal event" * 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Fix incorrect "Machine check from unknown source" message x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out() x86/mce: Check for alternate indication of machine check recovery on Skylake x86/mce: Improve error message when kernel cannot recover
2 parents 6242258 + 40c36e2 commit a43de48

File tree

3 files changed

+42
-18
lines changed

3 files changed

+42
-18
lines changed

arch/x86/kernel/cpu/mcheck/mce-severity.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ static struct severity {
160160
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
161161
USER
162162
),
163+
MCESEV(
164+
PANIC, "Data load in unrecoverable area of kernel",
165+
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
166+
KERNEL
167+
),
163168
#endif
164169
MCESEV(
165170
PANIC, "Action required: unknown MCACOD",

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
772772
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
773773
struct pt_regs *regs)
774774
{
775-
int i, ret = 0;
776775
char *tmp;
776+
int i;
777777

778778
for (i = 0; i < mca_cfg.banks; i++) {
779779
m->status = mce_rdmsrl(msr_ops.status(i));
780-
if (m->status & MCI_STATUS_VAL) {
781-
__set_bit(i, validp);
782-
if (quirk_no_way_out)
783-
quirk_no_way_out(i, m, regs);
784-
}
780+
if (!(m->status & MCI_STATUS_VAL))
781+
continue;
782+
783+
__set_bit(i, validp);
784+
if (quirk_no_way_out)
785+
quirk_no_way_out(i, m, regs);
785786

786787
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
788+
mce_read_aux(m, i);
787789
*msg = tmp;
788-
ret = 1;
790+
return 1;
789791
}
790792
}
791-
return ret;
793+
return 0;
792794
}
793795

794796
/*
@@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
12051207
lmce = m.mcgstatus & MCG_STATUS_LMCES;
12061208

12071209
/*
1210+
* Local machine check may already know that we have to panic.
1211+
* Broadcast machine check begins rendezvous in mce_start().
12081212
* Go through all banks in exclusion of the other CPUs. This way we
12091213
* don't report duplicated events on shared banks because the first one
1210-
* to see it will clear it. If this is a Local MCE, then no need to
1211-
* perform rendezvous.
1214+
* to see it will clear it.
12121215
*/
1213-
if (!lmce)
1216+
if (lmce) {
1217+
if (no_way_out)
1218+
mce_panic("Fatal local machine check", &m, msg);
1219+
} else {
12141220
order = mce_start(&no_way_out);
1221+
}
12151222

12161223
for (i = 0; i < cfg->banks; i++) {
12171224
__clear_bit(i, toclear);
@@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
12871294
no_way_out = worst >= MCE_PANIC_SEVERITY;
12881295
} else {
12891296
/*
1290-
* Local MCE skipped calling mce_reign()
1291-
* If we found a fatal error, we need to panic here.
1297+
* If there was a fatal machine check we should have
1298+
* already called mce_panic earlier in this function.
1299+
* Since we re-read the banks, we might have found
1300+
* something new. Check again to see if we found a
1301+
* fatal error. We call "mce_severity()" again to
1302+
* make sure we have the right "msg".
12921303
*/
1293-
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1294-
mce_panic("Machine check from unknown source",
1295-
NULL, NULL);
1304+
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1305+
mce_severity(&m, cfg->tolerant, &msg, true);
1306+
mce_panic("Local fatal machine check!", &m, msg);
1307+
}
12961308
}
12971309

12981310
/*

arch/x86/kernel/quirks.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
645645
/* Skylake */
646646
static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
647647
{
648-
u32 capid0;
648+
u32 capid0, capid5;
649649

650650
pci_read_config_dword(pdev, 0x84, &capid0);
651+
pci_read_config_dword(pdev, 0x98, &capid5);
651652

652-
if ((capid0 & 0xc0) == 0xc0)
653+
/*
654+
* CAPID0{7:6} indicate whether this is an advanced RAS SKU
655+
* CAPID5{8:5} indicate that various NVDIMM usage modes are
656+
* enabled, so memory machine check recovery is also enabled.
657+
*/
658+
if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
653659
static_branch_inc(&mcsafe_key);
660+
654661
}
655662
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
656663
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);

0 commit comments

Comments
 (0)