Skip to content

Commit 1a1d569

Browse files
committed
Merge tag 'edac_urgent_for_v6.15_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull EDAC fixes from Borislav Petkov:
 "Two fixes to the AMD translation library for the MI300 side of things:

   - Use the row[13] bit when calculating the memory row to retire

   - Mask the physical row address in order to avoid creating duplicate
     error records"

* tag 'edac_urgent_for_v6.15_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  RAS/AMD/FMPM: Get masked address
  RAS/AMD/ATL: Include row[13] bit in row retirement
2 parents 065d498 + 58029c3 commit 1a1d569

File tree

3 files changed

+28
-3
lines changed

3 files changed

+28
-3
lines changed

drivers/ras/amd/atl/internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,4 +362,7 @@ static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx)
362362
atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode);
363363
}
364364

365+
#define MI300_UMC_MCA_COL GENMASK(5, 1)
366+
#define MI300_UMC_MCA_ROW13 BIT(23)
367+
365368
#endif /* __AMD_ATL_INTERNAL_H__ */

drivers/ras/amd/atl/umc.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,6 @@ int get_umc_info_mi300(void)
229229
* Additionally, the PC and Bank bits may be hashed. This must be accounted for before
230230
* reconstructing the normalized address.
231231
*/
232-
#define MI300_UMC_MCA_COL GENMASK(5, 1)
233232
#define MI300_UMC_MCA_BANK GENMASK(9, 6)
234233
#define MI300_UMC_MCA_ROW GENMASK(24, 10)
235234
#define MI300_UMC_MCA_PC BIT(25)
@@ -320,7 +319,7 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
320319
* See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
321320
*/
322321
#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
323-
static void retire_row_mi300(struct atl_err *a_err)
322+
static void _retire_row_mi300(struct atl_err *a_err)
324323
{
325324
unsigned long addr;
326325
struct page *p;
@@ -351,6 +350,22 @@ static void retire_row_mi300(struct atl_err *a_err)
351350
}
352351
}
353352

353+
/*
354+
* In addition to the column bits, the row[13] bit should also be included when
355+
* calculating addresses affected by a physical row.
356+
*
357+
* Instead of running through another loop over a single bit, just run through
358+
* the column bits twice and flip the row[13] bit in-between.
359+
*
360+
* See MI300_UMC_MCA_ROW for the row bits in MCA_ADDR_UMC value.
361+
*/
362+
static void retire_row_mi300(struct atl_err *a_err)
363+
{
364+
_retire_row_mi300(a_err);
365+
a_err->addr ^= MI300_UMC_MCA_ROW13;
366+
_retire_row_mi300(a_err);
367+
}
368+
354369
void amd_retire_dram_row(struct atl_err *a_err)
355370
{
356371
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)

drivers/ras/amd/fmpm.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,13 @@ static bool rec_has_valid_entries(struct fru_rec *rec)
250250
return true;
251251
}
252252

253+
/*
254+
* Row retirement is done on MI300 systems, and some bits are 'don't
255+
* care' for comparing addresses with unique physical rows. This
256+
* includes all column bits and the row[13] bit.
257+
*/
258+
#define MASK_ADDR(addr) ((addr) & ~(MI300_UMC_MCA_ROW13 | MI300_UMC_MCA_COL))
259+
253260
static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
254261
{
255262
/*
@@ -258,7 +265,7 @@ static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_
258265
*
259266
* Also, order the checks from most->least likely to fail to shortcut the code.
260267
*/
261-
if (old->addr != new->addr)
268+
if (MASK_ADDR(old->addr) != MASK_ADDR(new->addr))
262269
return false;
263270

264271
if (old->hw_id != new->hw_id)

0 commit comments

Comments
 (0)