Skip to content

Commit cb51a37

Browse files
Robert Richter authored and suryasaimadhu committed
EDAC/ghes: Setup DIMM label from DMI and use it in error reports
The ghes driver reports errors with 'unknown label' even if the actual DIMM label is known, e.g.:

  EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0 page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0 status(0x0000000000000400): Storage error in DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

  EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0 page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0 status(0x0000000000000400): Storage error in DRAM memory)

The labels are initialized by reading the bank and device strings from DMI. Now, the label information can also be read from sysfs. E.g. a ThunderX2 system will show the following:

  /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
  /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
  /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
  /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
  /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
  /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
  /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
  /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
  /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
  /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
  /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
  /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
  /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
  /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
  /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
  /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since dimm_labels can be rewritten, that label will be used in a later error report:

  # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
  # # some error injection here
  # dmesg | grep foobar
  [ 751.383533] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0 module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM location:N0 DIMM_A0 page:0x8c8dc74 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0 card:0 module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM location:N0 DIMM_A0 status(0x0000000000000400): Storage error in DRAM memory)

[ bp: Remove curly brackets around a single if-statement in dimm_setup_label(). ]

Signed-off-by: Robert Richter <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent b3a9e3b commit cb51a37

File tree

1 file changed

+24
-11
lines changed

1 file changed

+24
-11
lines changed

drivers/edac/ghes_edac.c

Lines changed: 24 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -87,16 +87,27 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
8787
(*num_dimm)++;
8888
}
8989

90-
static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
90+
static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
9191
{
9292
struct dimm_info *dimm;
9393

9494
mci_for_each_dimm(mci, dimm) {
9595
if (dimm->smbios_handle == handle)
96-
return dimm->idx;
96+
return dimm;
9797
}
9898

99-
return -1;
99+
return NULL;
100+
}
101+
102+
static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
103+
{
104+
const char *bank = NULL, *device = NULL;
105+
106+
dmi_memdev_name(handle, &bank, &device);
107+
108+
/* both strings must be non-zero */
109+
if (bank && *bank && device && *device)
110+
snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device);
100111
}
101112

102113
static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
@@ -179,9 +190,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
179190
dimm->dtype = DEV_UNKNOWN;
180191
dimm->grain = 128; /* Likely, worse case */
181192

182-
/*
183-
* FIXME: It shouldn't be hard to also fill the DIMM labels
184-
*/
193+
dimm_setup_label(dimm, entry->handle);
185194

186195
if (dimm->nr_pages) {
187196
edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
@@ -228,7 +237,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
228237
memset(e, 0, sizeof (*e));
229238
e->error_count = 1;
230239
e->grain = 1;
231-
strcpy(e->label, "unknown label");
232240
e->msg = pvt->msg;
233241
e->other_detail = pvt->other_detail;
234242
e->top_layer = -1;
@@ -345,7 +353,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
345353
p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
346354
if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
347355
const char *bank = NULL, *device = NULL;
348-
int index = -1;
356+
struct dimm_info *dimm;
349357

350358
dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
351359
if (bank != NULL && device != NULL)
@@ -354,13 +362,18 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
354362
p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
355363
mem_err->mem_dev_handle);
356364

357-
index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
358-
if (index >= 0)
359-
e->top_layer = index;
365+
dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
366+
if (dimm) {
367+
e->top_layer = dimm->idx;
368+
strcpy(e->label, dimm->label);
369+
}
360370
}
361371
if (p > e->location)
362372
*(p - 1) = '\0';
363373

374+
if (!*e->label)
375+
strcpy(e->label, "unknown memory");
376+
364377
/* All other fields are mapped on e->other_detail */
365378
p = pvt->other_detail;
366379
p += snprintf(p, sizeof(pvt->other_detail),

0 commit comments

Comments
 (0)