Skip to content

Commit a1c75e1

Browse files
committed
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Ingo Molnar:

 - various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)

 - extend Intel CMCI error reporting to more cases (Xie XiuQi)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/MCE: Make correctable error detection look at the Deferred bit
  x86/MCE: Report only DRAM ECC as memory errors on AMD systems
  x86/MCE/AMD: Define a function to get SMCA bank type
  x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
  x86/MCE: Extend table to report action optional errors through CMCI too
2 parents d8b91dd + 179eb85 commit a1c75e1

File tree

4 files changed

+60
-14
lines changed

4 files changed

+60
-14
lines changed

arch/x86/include/asm/mce.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ struct smca_bank {
376376
extern struct smca_bank smca_banks[MAX_NR_BANKS];
377377

378378
extern const char *smca_get_long_name(enum smca_bank_types t);
379+
extern bool amd_mce_is_memory_error(struct mce *m);
379380

380381
extern int mce_threshold_create_device(unsigned int cpu);
381382
extern int mce_threshold_remove_device(unsigned int cpu);
@@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
384385

385386
/*
 * Stub variant: does nothing and reports success (0) for any @cpu.
 * NOTE(review): presumably selected when the real threshold code is compiled
 * out; the enclosing #if/#else is not visible in this hunk.
 */
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }
386387
/*
 * Stub variant: does nothing and reports success (0) for any @cpu.
 * NOTE(review): presumably selected when the real threshold code is compiled
 * out; the enclosing #if/#else is not visible in this hunk.
 */
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }
388+
/*
 * Stub variant: never classifies @m as a memory error; @m is not inspected.
 * NOTE(review): presumably selected when the AMD MCE code is compiled out;
 * the enclosing #if/#else is not visible in this hunk.
 */
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }
387389

388390
#endif
389391

arch/x86/kernel/cpu/mcheck/mce-severity.c

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ static struct severity {
5959
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
6060
#define MASK(x, y) .mask = x, .result = y
6161
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
62+
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
6263
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
6364
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
6465

@@ -101,6 +102,22 @@ static struct severity {
101102
NOSER, BITCLR(MCI_STATUS_UC)
102103
),
103104

105+
/*
106+
* known AO MCACODs reported via MCE or CMC:
107+
*
108+
* SRAO could be signaled either via a machine check exception or
109+
* CMCI with the corresponding bit S 1 or 0. So we don't need to
110+
* check bit S for SRAO.
111+
*/
112+
MCESEV(
113+
AO, "Action optional: memory scrubbing error",
114+
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
115+
),
116+
MCESEV(
117+
AO, "Action optional: last level cache writeback error",
118+
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
119+
),
120+
104121
/* ignore OVER for UCNA */
105122
MCESEV(
106123
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
149166
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
150167
),
151168

152-
/* known AO MCACODs: */
153-
MCESEV(
154-
AO, "Action optional: memory scrubbing error",
155-
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
156-
),
157-
MCESEV(
158-
AO, "Action optional: last level cache writeback error",
159-
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
160-
),
161169
MCESEV(
162170
SOME, "Action optional: unknown MCACOD",
163171
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
503503
bool mce_is_memory_error(struct mce *m)
504504
{
505505
if (m->cpuvendor == X86_VENDOR_AMD) {
506-
/* ErrCodeExt[20:16] */
507-
u8 xec = (m->status >> 16) & 0x1f;
506+
return amd_mce_is_memory_error(m);
508507

509-
return (xec == 0x0 || xec == 0x8);
510508
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
511509
/*
512510
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,14 +528,25 @@ bool mce_is_memory_error(struct mce *m)
530528
}
531529
EXPORT_SYMBOL_GPL(mce_is_memory_error);
532530

531+
static bool mce_is_correctable(struct mce *m)
532+
{
533+
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
534+
return false;
535+
536+
if (m->status & MCI_STATUS_UC)
537+
return false;
538+
539+
return true;
540+
}
541+
533542
static bool cec_add_mce(struct mce *m)
534543
{
535544
if (!m)
536545
return false;
537546

538547
/* We eat only correctable DRAM errors with usable addresses. */
539548
if (mce_is_memory_error(m) &&
540-
!(m->status & MCI_STATUS_UC) &&
549+
mce_is_correctable(m) &&
541550
mce_usable_address(m))
542551
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
543552
return true;

arch/x86/kernel/cpu/mcheck/mce_amd.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
110110
}
111111
EXPORT_SYMBOL_GPL(smca_get_long_name);
112112

113+
static enum smca_bank_types smca_get_bank_type(struct mce *m)
114+
{
115+
struct smca_bank *b;
116+
117+
if (m->bank >= N_SMCA_BANK_TYPES)
118+
return N_SMCA_BANK_TYPES;
119+
120+
b = &smca_banks[m->bank];
121+
if (!b->hwid)
122+
return N_SMCA_BANK_TYPES;
123+
124+
return b->hwid->bank_type;
125+
}
126+
113127
static struct smca_hwid smca_hwid_mcatypes[] = {
114128
/* { bank_type, hwid_mcatype, xec_bitmap } */
115129

@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
407421
(deferred_error_int_vector != amd_deferred_error_interrupt))
408422
deferred_error_int_vector = amd_deferred_error_interrupt;
409423

410-
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
424+
if (!mce_flags.smca)
425+
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
426+
411427
wrmsr(MSR_CU_DEF_ERR, low, high);
412428
}
413429

@@ -738,6 +754,17 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
738754
}
739755
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
740756

757+
bool amd_mce_is_memory_error(struct mce *m)
758+
{
759+
/* ErrCodeExt[20:16] */
760+
u8 xec = (m->status >> 16) & 0x1f;
761+
762+
if (mce_flags.smca)
763+
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;
764+
765+
return m->bank == 4 && xec == 0x8;
766+
}
767+
741768
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
742769
{
743770
struct mce m;

0 commit comments

Comments
 (0)