Skip to content

Commit 011d826

Browse files
suryasaimadhu authored and Ingo Molnar committed
RAS: Add a Corrected Errors Collector
Introduce a simple data structure for collecting correctable errors along with accessors. More detailed description in the code itself. The error decoding is done with the decoding chain now: mce_first_notifier() gets to see the error first, and the CEC decides either to log it - in which case the rest of the chain doesn't hear about it, which is basically the main reason for the CE collector - or to continue running the notifiers. When the CEC hits the action threshold, it will try to soft-offline the page containing the ECC and then the whole decoding chain gets to see the error. Signed-off-by: Borislav Petkov <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: linux-edac <[email protected]> Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Ingo Molnar <[email protected]>
1 parent e64edfc commit 011d826

File tree

10 files changed

+706
-83
lines changed

10 files changed

+706
-83
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3172,6 +3172,12 @@
31723172
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
31733173
See Documentation/blockdev/ramdisk.txt.
31743174

3175+
ras=option[,option,...] [KNL] RAS-specific options
3176+
3177+
cec_disable [X86]
3178+
Disable the Correctable Errors Collector,
3179+
see CONFIG_RAS_CEC help text.
3180+
31753181
rcu_nocbs= [KNL]
31763182
The argument is a cpu list, as described above.
31773183

arch/x86/include/asm/mce.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
191191
extern struct mca_msr_regs msr_ops;
192192

193193
enum mce_notifier_prios {
194-
MCE_PRIO_SRAO = INT_MAX,
195-
MCE_PRIO_EXTLOG = INT_MAX - 1,
196-
MCE_PRIO_NFIT = INT_MAX - 2,
197-
MCE_PRIO_EDAC = INT_MAX - 3,
194+
MCE_PRIO_FIRST = INT_MAX,
195+
MCE_PRIO_SRAO = INT_MAX - 1,
196+
MCE_PRIO_EXTLOG = INT_MAX - 2,
197+
MCE_PRIO_NFIT = INT_MAX - 3,
198+
MCE_PRIO_EDAC = INT_MAX - 4,
198199
MCE_PRIO_LOWEST = 0,
199200
};
200201

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 115 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <linux/poll.h>
3636
#include <linux/nmi.h>
3737
#include <linux/cpu.h>
38+
#include <linux/ras.h>
3839
#include <linux/smp.h>
3940
#include <linux/fs.h>
4041
#include <linux/mm.h>
@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {
160161

161162
void mce_log(struct mce *m)
162163
{
163-
unsigned next, entry;
164-
165-
/* Emit the trace record: */
166-
trace_mce_record(m);
167-
168164
if (!mce_gen_pool_add(m))
169165
irq_work_queue(&mce_irq_work);
170-
171-
wmb();
172-
for (;;) {
173-
entry = mce_log_get_idx_check(mcelog_buf.next);
174-
for (;;) {
175-
176-
/*
177-
* When the buffer fills up discard new entries.
178-
* Assume that the earlier errors are the more
179-
* interesting ones:
180-
*/
181-
if (entry >= MCE_LOG_LEN) {
182-
set_bit(MCE_OVERFLOW,
183-
(unsigned long *)&mcelog_buf.flags);
184-
return;
185-
}
186-
/* Old left over entry. Skip: */
187-
if (mcelog_buf.entry[entry].finished) {
188-
entry++;
189-
continue;
190-
}
191-
break;
192-
}
193-
smp_rmb();
194-
next = entry + 1;
195-
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
196-
break;
197-
}
198-
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
199-
wmb();
200-
mcelog_buf.entry[entry].finished = 1;
201-
wmb();
202-
203-
set_bit(0, &mce_need_notify);
204166
}
205167

206168
void mce_inject_log(struct mce *m)
@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
213175

214176
static struct notifier_block mce_srao_nb;
215177

178+
/*
179+
* We run the default notifier if we have only the SRAO, the first and the
180+
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
181+
* notifiers registered on the chain.
182+
*/
183+
#define NUM_DEFAULT_NOTIFIERS 3
216184
static atomic_t num_notifiers;
217185

218186
void mce_register_decode_chain(struct notifier_block *nb)
@@ -522,7 +490,6 @@ static void mce_schedule_work(void)
522490

523491
static void mce_irq_work_cb(struct irq_work *entry)
524492
{
525-
mce_notify_irq();
526493
mce_schedule_work();
527494
}
528495

@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
565532
return 1;
566533
}
567534

535+
static bool memory_error(struct mce *m)
536+
{
537+
struct cpuinfo_x86 *c = &boot_cpu_data;
538+
539+
if (c->x86_vendor == X86_VENDOR_AMD) {
540+
/* ErrCodeExt[20:16] */
541+
u8 xec = (m->status >> 16) & 0x1f;
542+
543+
return (xec == 0x0 || xec == 0x8);
544+
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
545+
/*
546+
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
547+
*
548+
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
549+
* indicating a memory error. Bit 8 is used for indicating a
550+
* cache hierarchy error. The combination of bit 2 and bit 3
551+
* is used for indicating a `generic' cache hierarchy error
552+
* But we can't just blindly check the above bits, because if
553+
* bit 11 is set, then it is a bus/interconnect error - and
554+
* either way the above bits just gives more detail on what
555+
* bus/interconnect error happened. Note that bit 12 can be
556+
* ignored, as it's the "filter" bit.
557+
*/
558+
return (m->status & 0xef80) == BIT(7) ||
559+
(m->status & 0xef00) == BIT(8) ||
560+
(m->status & 0xeffc) == 0xc;
561+
}
562+
563+
return false;
564+
}
565+
566+
static bool cec_add_mce(struct mce *m)
567+
{
568+
if (!m)
569+
return false;
570+
571+
/* We eat only correctable DRAM errors with usable addresses. */
572+
if (memory_error(m) &&
573+
!(m->status & MCI_STATUS_UC) &&
574+
mce_usable_address(m))
575+
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
576+
return true;
577+
578+
return false;
579+
}
580+
581+
static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
582+
void *data)
583+
{
584+
struct mce *m = (struct mce *)data;
585+
unsigned int next, entry;
586+
587+
if (!m)
588+
return NOTIFY_DONE;
589+
590+
if (cec_add_mce(m))
591+
return NOTIFY_STOP;
592+
593+
/* Emit the trace record: */
594+
trace_mce_record(m);
595+
596+
wmb();
597+
for (;;) {
598+
entry = mce_log_get_idx_check(mcelog_buf.next);
599+
for (;;) {
600+
601+
/*
602+
* When the buffer fills up discard new entries.
603+
* Assume that the earlier errors are the more
604+
* interesting ones:
605+
*/
606+
if (entry >= MCE_LOG_LEN) {
607+
set_bit(MCE_OVERFLOW,
608+
(unsigned long *)&mcelog_buf.flags);
609+
return NOTIFY_DONE;
610+
}
611+
/* Old left over entry. Skip: */
612+
if (mcelog_buf.entry[entry].finished) {
613+
entry++;
614+
continue;
615+
}
616+
break;
617+
}
618+
smp_rmb();
619+
next = entry + 1;
620+
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
621+
break;
622+
}
623+
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
624+
wmb();
625+
mcelog_buf.entry[entry].finished = 1;
626+
wmb();
627+
628+
set_bit(0, &mce_need_notify);
629+
630+
mce_notify_irq();
631+
632+
return NOTIFY_DONE;
633+
}
634+
635+
static struct notifier_block first_nb = {
636+
.notifier_call = mce_first_notifier,
637+
.priority = MCE_PRIO_FIRST,
638+
};
639+
568640
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
569641
void *data)
570642
{
@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
594666
if (!m)
595667
return NOTIFY_DONE;
596668

597-
/*
598-
* Run the default notifier if we have only the SRAO
599-
* notifier and us registered.
600-
*/
601-
if (atomic_read(&num_notifiers) > 2)
669+
if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
602670
return NOTIFY_DONE;
603671

604672
/* Don't print when mcelog is running */
@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
655723
}
656724
}
657725

658-
static bool memory_error(struct mce *m)
659-
{
660-
struct cpuinfo_x86 *c = &boot_cpu_data;
661-
662-
if (c->x86_vendor == X86_VENDOR_AMD) {
663-
/* ErrCodeExt[20:16] */
664-
u8 xec = (m->status >> 16) & 0x1f;
665-
666-
return (xec == 0x0 || xec == 0x8);
667-
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
668-
/*
669-
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
670-
*
671-
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
672-
* indicating a memory error. Bit 8 is used for indicating a
673-
* cache hierarchy error. The combination of bit 2 and bit 3
674-
* is used for indicating a `generic' cache hierarchy error
675-
* But we can't just blindly check the above bits, because if
676-
* bit 11 is set, then it is a bus/interconnect error - and
677-
* either way the above bits just gives more detail on what
678-
* bus/interconnect error happened. Note that bit 12 can be
679-
* ignored, as it's the "filter" bit.
680-
*/
681-
return (m->status & 0xef80) == BIT(7) ||
682-
(m->status & 0xef00) == BIT(8) ||
683-
(m->status & 0xeffc) == 0xc;
684-
}
685-
686-
return false;
687-
}
688-
689726
DEFINE_PER_CPU(unsigned, mce_poll_count);
690727

691728
/*
@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
21672204
int __init mcheck_init(void)
21682205
{
21692206
mcheck_intel_therm_init();
2207+
mce_register_decode_chain(&first_nb);
21702208
mce_register_decode_chain(&mce_srao_nb);
21712209
mce_register_decode_chain(&mce_default_nb);
21722210
mcheck_vendor_init_severity();
@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
27162754
static_branch_inc(&mcsafe_key);
27172755

27182756
mcheck_debugfs_init();
2757+
cec_init();
27192758

27202759
/*
27212760
* Flush out everything that has been logged during early boot, now that

arch/x86/ras/Kconfig

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
77
aspects of the MCE handling code.
88

99
WARNING: Do not even assume this interface is staying stable!
10+
11+
config RAS_CEC
12+
bool "Correctable Errors Collector"
13+
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
14+
---help---
15+
This is a small cache which collects correctable memory errors per 4K
16+
page PFN and counts their repeated occurrence. Once the counter for a
17+
PFN overflows, we try to soft-offline that page as we take it to mean
18+
that it has reached a relatively high error count and would probably
19+
be best if we don't use it anymore.
20+
21+
Bear in mind that this is absolutely useless if your platform doesn't
22+
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
23+

drivers/ras/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
obj-$(CONFIG_RAS) += ras.o debugfs.o
1+
obj-$(CONFIG_RAS) += ras.o debugfs.o
2+
obj-$(CONFIG_RAS_CEC) += cec.o

0 commit comments

Comments
 (0)