Skip to content

Commit 3b566b3

Browse files
yghannam authored and bp3tk0v committed
RAS/AMD/ATL: Add MI300 row retirement support
DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. Signed-off-by: Yazen Ghannam <[email protected]> Signed-off-by: Borislav Petkov (AMD) <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 0e4fd81 commit 3b566b3

File tree

3 files changed

+54
-0
lines changed

3 files changed

+54
-0
lines changed

drivers/ras/amd/atl/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
config AMD_ATL
1111
tristate "AMD Address Translation Library"
1212
depends on AMD_NB && X86_64 && RAS
13+
depends on MEMORY_FAILURE
1314
default N
1415
help
1516
This library includes support for implementation-specific

drivers/ras/amd/atl/umc.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
239239
return addr;
240240
}
241241

242+
/*
243+
* When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
244+
* all memory within that DRAM row. This applies to the memory with a DRAM
245+
* bank.
246+
*
247+
* To find the memory addresses, loop through permutations of the DRAM column
248+
* bits and find the System Physical address of each. The column bits are used
249+
* to calculate the intermediate Normalized address, so all permutations should
250+
* be checked.
251+
*
252+
* See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
253+
*/
254+
#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
255+
static void retire_row_mi300(struct atl_err *a_err)
256+
{
257+
unsigned long addr;
258+
struct page *p;
259+
u8 col;
260+
261+
for (col = 0; col < MI300_NUM_COL; col++) {
262+
a_err->addr &= ~MI300_UMC_MCA_COL;
263+
a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
264+
265+
addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
266+
if (IS_ERR_VALUE(addr))
267+
continue;
268+
269+
addr = PHYS_PFN(addr);
270+
271+
/*
272+
* Skip invalid or already poisoned pages to avoid unnecessary
273+
* error messages from memory_failure().
274+
*/
275+
p = pfn_to_online_page(addr);
276+
if (!p)
277+
continue;
278+
279+
if (PageHWPoison(p))
280+
continue;
281+
282+
memory_failure(addr, 0);
283+
}
284+
}
285+
286+
void amd_retire_dram_row(struct atl_err *a_err)
287+
{
288+
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
289+
return retire_row_mi300(a_err);
290+
}
291+
EXPORT_SYMBOL_GPL(amd_retire_dram_row);
292+
242293
static unsigned long get_addr(unsigned long addr)
243294
{
244295
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)

include/linux/ras.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,10 @@ struct atl_err {
4545
/*
 * AMD Address Translation Library (ATL) entry points. The real functions are
 * declared when CONFIG_AMD_ATL is enabled; otherwise inline stubs are
 * provided for the row-retirement and address-conversion calls only.
 */
#if IS_ENABLED(CONFIG_AMD_ATL)
4646
/* Register the decoder callback @f, which takes a struct atl_err. */
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
4747
/* Counterpart of amd_atl_register_decoder(). */
void amd_atl_unregister_decoder(void);
48+
/* Retire the DRAM row associated with the error described by @err. */
void amd_retire_dram_row(struct atl_err *err);
4849
/* Convert the UMC MCA address in @err to a system address. */
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
4950
#else
51+
/* ATL disabled: row retirement is a no-op. */
static inline void amd_retire_dram_row(struct atl_err *err) { }
5052
/* ATL disabled: no conversion is available, so -EINVAL is returned. */
static inline unsigned long
5153
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
5254
#endif /* CONFIG_AMD_ATL */

0 commit comments

Comments
 (0)