Skip to content

Commit 87a6123

Browse files
yghannam authored and bp3tk0v committed
RAS/AMD/ATL: Add MI300 DRAM to normalized address translation support
Zen-based AMD systems report DRAM ECC errors through Unified Memory Controller (UMC) MCA banks. The value provided in MCA_ADDR is a "normalized" address which represents the UMC's view of its managed memory. The normalized address must be translated to a system physical address for software to take action. MI300 systems, uniquely, do not provide a normalized address in MCA_ADDR for DRAM ECC errors. Rather, the "DRAM" address is reported. This value includes identifiers for the bank, row, column, pseudochannel and stack of the memory location. The DRAM address must be converted to a normalized address in order to be further translated to a system physical address. Add helper functions to do the DRAM to normalized translation for MI300 systems. The method is based on the fixed hardware layout of the on-chip memory. [ bp: Massage commit message, decapitalize some, rename function. ] Signed-off-by: Yazen Ghannam <[email protected]> Co-developed-by: Muralidhara M K <[email protected]> Signed-off-by: Muralidhara M K <[email protected]> Signed-off-by: Borislav Petkov (AMD) <[email protected]> Tested-by: Muralidhara M K <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent a7b5737 commit 87a6123

File tree

3 files changed

+205
-2
lines changed

3 files changed

+205
-2
lines changed

drivers/ras/amd/atl/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo);
224224

225225
int get_df_system_info(void);
226226
int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num);
227+
int get_addr_hash_mi300(void);
227228

228229
int get_address_map(struct addr_ctx *ctx);
229230

drivers/ras/amd/atl/system.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,13 @@ static int df4_determine_df_rev(u32 reg)
124124
if (reg == DF_FUNC0_ID_ZEN4_SERVER)
125125
df_cfg.flags.socket_id_shift_quirk = 1;
126126

127-
if (reg == DF_FUNC0_ID_MI300)
127+
if (reg == DF_FUNC0_ID_MI300) {
128128
df_cfg.flags.heterogeneous = 1;
129129

130+
if (get_addr_hash_mi300())
131+
return -EINVAL;
132+
}
133+
130134
return df4_get_fabric_id_mask_registers();
131135
}
132136

drivers/ras/amd/atl/umc.c

Lines changed: 199 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,204 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
4949
return i;
5050
}
5151

52+
/* XOR the bits in @val. */
53+
static u16 bitwise_xor_bits(u16 val)
54+
{
55+
u16 tmp = 0;
56+
u8 i;
57+
58+
for (i = 0; i < 16; i++)
59+
tmp ^= (val >> i) & 0x1;
60+
61+
return tmp;
62+
}
63+
64+
struct xor_bits {
65+
bool xor_enable;
66+
u16 col_xor;
67+
u32 row_xor;
68+
};
69+
70+
#define NUM_BANK_BITS 4
71+
72+
static struct {
73+
/* UMC::CH::AddrHashBank */
74+
struct xor_bits bank[NUM_BANK_BITS];
75+
76+
/* UMC::CH::AddrHashPC */
77+
struct xor_bits pc;
78+
79+
/* UMC::CH::AddrHashPC2 */
80+
u8 bank_xor;
81+
} addr_hash;
82+
83+
/* SMN addresses of the address hash registers, read with amd_smn_read(). */
#define MI300_UMC_CH_BASE	0x90000
#define MI300_ADDR_HASH_BANK0	(MI300_UMC_CH_BASE + 0xC8)
#define MI300_ADDR_HASH_PC	(MI300_UMC_CH_BASE + 0xE0)
#define MI300_ADDR_HASH_PC2	(MI300_UMC_CH_BASE + 0xE4)

/* Fields extracted from the AddrHashBank and AddrHashPC register values. */
#define ADDR_HASH_XOR_EN	BIT(0)
#define ADDR_HASH_COL_XOR	GENMASK(13, 1)
#define ADDR_HASH_ROW_XOR	GENMASK(31, 14)

/* Field extracted from the AddrHashPC2 register value. */
#define ADDR_HASH_BANK_XOR	GENMASK(5, 0)
92+
93+
/*
94+
* Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
95+
* for hashing. Do this during module init, since the values will not
96+
* change during run time.
97+
*
98+
* These registers are instantiated for each UMC across each AMD Node.
99+
* However, they should be identically programmed due to the fixed hardware
100+
* design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
101+
* single global structure for simplicity.
102+
*/
103+
int get_addr_hash_mi300(void)
104+
{
105+
u32 temp;
106+
int ret;
107+
u8 i;
108+
109+
for (i = 0; i < NUM_BANK_BITS; i++) {
110+
ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
111+
if (ret)
112+
return ret;
113+
114+
addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp);
115+
addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp);
116+
addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
117+
}
118+
119+
ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
120+
if (ret)
121+
return ret;
122+
123+
addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp);
124+
addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp);
125+
addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
126+
127+
ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
128+
if (ret)
129+
return ret;
130+
131+
addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
132+
133+
return 0;
134+
}
135+
136+
/*
 * On MI300 systems, MCA_ADDR holds a DRAM address for DRAM ECC errors. It has
 * to be converted to the intermediate normalized address (NA) before it can be
 * translated to a system physical address.
 *
 * Besides bank, row, and column, the DRAM address carries bits for the
 * pseudochannel (PC) and the stack ID (SID).
 *
 * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
 *
 * Layout of the MCA address:
 * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
 *
 * Layout of the normalized address, which is fixed in hardware:
 * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
 *
 * The PC and Bank bits may additionally be hashed. This has to be accounted
 * for before the normalized address is reconstructed.
 */

/* Fields of the DRAM address found in MCA_ADDR. */
#define MI300_UMC_MCA_COL	GENMASK(5, 1)
#define MI300_UMC_MCA_BANK	GENMASK(9, 6)
#define MI300_UMC_MCA_ROW	GENMASK(24, 10)
#define MI300_UMC_MCA_PC	BIT(25)
#define MI300_UMC_MCA_SID	GENMASK(27, 26)

/* Fields of the normalized address. */
#define MI300_NA_COL_1_0	GENMASK(6, 5)
#define MI300_NA_PC		BIT(7)
#define MI300_NA_COL_3_2	GENMASK(9, 8)
#define MI300_NA_BANK_3_2	GENMASK(11, 10)
#define MI300_NA_BANK_1_0	GENMASK(13, 12)
#define MI300_NA_COL_4		BIT(14)
#define MI300_NA_ROW		GENMASK(28, 15)
#define MI300_NA_SID		GENMASK(30, 29)
169+
170+
static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
171+
{
172+
u16 i, col, row, bank, pc, sid, temp;
173+
174+
col = FIELD_GET(MI300_UMC_MCA_COL, addr);
175+
bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
176+
row = FIELD_GET(MI300_UMC_MCA_ROW, addr);
177+
pc = FIELD_GET(MI300_UMC_MCA_PC, addr);
178+
sid = FIELD_GET(MI300_UMC_MCA_SID, addr);
179+
180+
/* Calculate hash for each Bank bit. */
181+
for (i = 0; i < NUM_BANK_BITS; i++) {
182+
if (!addr_hash.bank[i].xor_enable)
183+
continue;
184+
185+
temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
186+
temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
187+
bank ^= temp << i;
188+
}
189+
190+
/* Calculate hash for PC bit. */
191+
if (addr_hash.pc.xor_enable) {
192+
/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
193+
bank |= sid << 5;
194+
195+
temp = bitwise_xor_bits(col & addr_hash.pc.col_xor);
196+
temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor);
197+
temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
198+
pc ^= temp;
199+
200+
/* Drop SID bits for the sake of debug printing later. */
201+
bank &= 0x1F;
202+
}
203+
204+
/* Reconstruct the normalized address starting with NA[4:0] = 0 */
205+
addr = 0;
206+
207+
/* NA[6:5] = Column[1:0] */
208+
temp = col & 0x3;
209+
addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
210+
211+
/* NA[7] = PC */
212+
addr |= FIELD_PREP(MI300_NA_PC, pc);
213+
214+
/* NA[9:8] = Column[3:2] */
215+
temp = (col >> 2) & 0x3;
216+
addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
217+
218+
/* NA[11:10] = Bank[3:2] */
219+
temp = (bank >> 2) & 0x3;
220+
addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
221+
222+
/* NA[13:12] = Bank[1:0] */
223+
temp = bank & 0x3;
224+
addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
225+
226+
/* NA[14] = Column[4] */
227+
temp = (col >> 4) & 0x1;
228+
addr |= FIELD_PREP(MI300_NA_COL_4, temp);
229+
230+
/* NA[28:15] = Row[13:0] */
231+
addr |= FIELD_PREP(MI300_NA_ROW, row);
232+
233+
/* NA[30:29] = SID[1:0] */
234+
addr |= FIELD_PREP(MI300_NA_SID, sid);
235+
236+
pr_debug("Addr=0x%016lx", addr);
237+
pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
238+
239+
return addr;
240+
}
241+
242+
static unsigned long get_addr(unsigned long addr)
243+
{
244+
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
245+
return convert_dram_to_norm_addr_mi300(addr);
246+
247+
return addr;
248+
}
249+
52250
#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44)
53251
static u8 get_die_id(struct atl_err *err)
54252
{
@@ -82,7 +280,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
82280
{
83281
u8 socket_id = topology_physical_package_id(err->cpu);
84282
u8 coh_st_inst_id = get_coh_st_inst_id(err);
85-
unsigned long addr = err->addr;
283+
unsigned long addr = get_addr(err->addr);
86284
u8 die_id = get_die_id(err);
87285

88286
pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",

0 commit comments

Comments
 (0)