Skip to content

Commit 0b5ccb0

Browse files
shijujose4davejiang
authored andcommitted
cxl/edac: Support for finding memory operation attributes from the current boot
Certain operations on memory, such as memory repair, are permitted only when the address and other attributes for the operation are from the current boot. This is determined by checking whether the memory attributes for the operation match those in the CXL gen_media or CXL DRAM memory event records reported during the current boot. The CXL event records must be backed up because they are cleared in the hardware after being processed by the kernel. Support is added for storing CXL gen_media or CXL DRAM memory event records in xarrays. Old records are deleted when they expire or when there is an overflow; this depends on the platform correctly reporting the Event Record Timestamp field of CXL spec Table 8-55 Common Event Record Format. Additionally, helper functions are implemented to find a matching record in the xarray storage based on the memory attributes and repair type. Add a validity check, when matching attributes for sparing, using the validity flag in the DRAM event record, to ensure that all required attributes for a requested repair operation are valid and set. Reviewed-by: Dave Jiang <[email protected]> Co-developed-by: Jonathan Cameron <[email protected]> Signed-off-by: Jonathan Cameron <[email protected]> Signed-off-by: Shiju Jose <[email protected]> Reviewed-by: Alison Schofield <[email protected]> Acked-by: Dan Williams <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Dave Jiang <[email protected]>
1 parent 077ee5f commit 0b5ccb0

File tree

5 files changed

+357
-2
lines changed

5 files changed

+357
-2
lines changed

drivers/cxl/Kconfig

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,27 @@ config CXL_EDAC_ECS
164164
of a memory ECS feature established by the platform/device.
165165
Otherwise say 'n'.
166166

167+
config CXL_EDAC_MEM_REPAIR
168+
bool "Enable CXL Memory Repair"
169+
depends on CXL_EDAC_MEM_FEATURES
170+
depends on EDAC_MEM_REPAIR
171+
help
172+
The CXL EDAC memory repair control is optional and allows host
173+
to control the memory repair features (e.g. sparing, PPR)
174+
configurations of CXL memory expander devices.
175+
176+
When enabled, the memory repair feature requires an additional
177+
memory of approximately 43KB to store CXL DRAM and CXL general
178+
media event records.
179+
180+
When enabled 'cxl_mem' EDAC devices are published with memory
181+
repair control attributes as described by
182+
Documentation/ABI/testing/sysfs-edac-memory-repair.
183+
184+
Say 'y' if you have an expert need to change default settings
185+
of a memory repair feature established by the platform/device.
186+
Otherwise say 'n'.
187+
167188
config CXL_PORT
168189
default CXL_BUS
169190
tristate

drivers/cxl/core/edac.c

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
#include <linux/cleanup.h>
1515
#include <linux/edac.h>
1616
#include <linux/limits.h>
17+
#include <linux/xarray.h>
1718
#include <cxl/features.h>
1819
#include <cxl.h>
1920
#include <cxlmem.h>
2021
#include "core.h"
22+
#include "trace.h"
2123

2224
#define CXL_NR_EDAC_DEV_FEATURES 2
2325

@@ -862,10 +864,285 @@ static int cxl_perform_maintenance(struct cxl_mailbox *cxl_mbox, u8 class,
862864
return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
863865
}
864866

867+
/*
868+
* Support for finding whether the attributes of a memory operation
869+
* are from the current boot or not.
870+
*/
871+
872+
struct cxl_mem_err_rec {
873+
struct xarray rec_gen_media;
874+
struct xarray rec_dram;
875+
};
876+
877+
/*
 * Kinds of memory repair operation whose attributes can be matched
 * against the stored event records. CXL_REPAIR_MAX is the number of
 * repair types and is not itself a repair type.
 */
enum cxl_mem_repair_type {
	CXL_PPR,
	CXL_CACHELINE_SPARING,
	CXL_ROW_SPARING,
	CXL_BANK_SPARING,
	CXL_RANK_SPARING,
	CXL_REPAIR_MAX,
};
885+
886+
/**
887+
* struct cxl_mem_repair_attrbs - CXL memory repair attributes
888+
* @dpa: DPA of memory to repair
889+
* @nibble_mask: nibble mask, identifies one or more nibbles on the memory bus
890+
* @row: row of memory to repair
891+
* @column: column of memory to repair
892+
* @channel: channel of memory to repair
893+
* @sub_channel: sub channel of memory to repair
894+
* @rank: rank of memory to repair
895+
* @bank_group: bank group of memory to repair
896+
* @bank: bank of memory to repair
897+
* @repair_type: repair type. For eg. PPR, memory sparing etc.
898+
*/
899+
struct cxl_mem_repair_attrbs {
900+
u64 dpa;
901+
u32 nibble_mask;
902+
u32 row;
903+
u16 column;
904+
u8 channel;
905+
u8 sub_channel;
906+
u8 rank;
907+
u8 bank_group;
908+
u8 bank;
909+
enum cxl_mem_repair_type repair_type;
910+
};
911+
912+
static struct cxl_event_gen_media *
913+
cxl_find_rec_gen_media(struct cxl_memdev *cxlmd,
914+
struct cxl_mem_repair_attrbs *attrbs)
915+
{
916+
struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
917+
struct cxl_event_gen_media *rec;
918+
919+
if (!array_rec)
920+
return NULL;
921+
922+
rec = xa_load(&array_rec->rec_gen_media, attrbs->dpa);
923+
if (!rec)
924+
return NULL;
925+
926+
if (attrbs->repair_type == CXL_PPR)
927+
return rec;
928+
929+
return NULL;
930+
}
931+
932+
static struct cxl_event_dram *
933+
cxl_find_rec_dram(struct cxl_memdev *cxlmd,
934+
struct cxl_mem_repair_attrbs *attrbs)
935+
{
936+
struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
937+
struct cxl_event_dram *rec;
938+
u16 validity_flags;
939+
940+
if (!array_rec)
941+
return NULL;
942+
943+
rec = xa_load(&array_rec->rec_dram, attrbs->dpa);
944+
if (!rec)
945+
return NULL;
946+
947+
validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags);
948+
if (!(validity_flags & CXL_DER_VALID_CHANNEL) ||
949+
!(validity_flags & CXL_DER_VALID_RANK))
950+
return NULL;
951+
952+
switch (attrbs->repair_type) {
953+
case CXL_PPR:
954+
if (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
955+
get_unaligned_le24(rec->nibble_mask) == attrbs->nibble_mask)
956+
return rec;
957+
break;
958+
case CXL_CACHELINE_SPARING:
959+
if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
960+
!(validity_flags & CXL_DER_VALID_BANK) ||
961+
!(validity_flags & CXL_DER_VALID_ROW) ||
962+
!(validity_flags & CXL_DER_VALID_COLUMN))
963+
return NULL;
964+
965+
if (rec->media_hdr.channel == attrbs->channel &&
966+
rec->media_hdr.rank == attrbs->rank &&
967+
rec->bank_group == attrbs->bank_group &&
968+
rec->bank == attrbs->bank &&
969+
get_unaligned_le24(rec->row) == attrbs->row &&
970+
get_unaligned_le16(rec->column) == attrbs->column &&
971+
(!(validity_flags & CXL_DER_VALID_NIBBLE) ||
972+
get_unaligned_le24(rec->nibble_mask) ==
973+
attrbs->nibble_mask) &&
974+
(!(validity_flags & CXL_DER_VALID_SUB_CHANNEL) ||
975+
rec->sub_channel == attrbs->sub_channel))
976+
return rec;
977+
break;
978+
case CXL_ROW_SPARING:
979+
if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
980+
!(validity_flags & CXL_DER_VALID_BANK) ||
981+
!(validity_flags & CXL_DER_VALID_ROW))
982+
return NULL;
983+
984+
if (rec->media_hdr.channel == attrbs->channel &&
985+
rec->media_hdr.rank == attrbs->rank &&
986+
rec->bank_group == attrbs->bank_group &&
987+
rec->bank == attrbs->bank &&
988+
get_unaligned_le24(rec->row) == attrbs->row &&
989+
(!(validity_flags & CXL_DER_VALID_NIBBLE) ||
990+
get_unaligned_le24(rec->nibble_mask) ==
991+
attrbs->nibble_mask))
992+
return rec;
993+
break;
994+
case CXL_BANK_SPARING:
995+
if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
996+
!(validity_flags & CXL_DER_VALID_BANK))
997+
return NULL;
998+
999+
if (rec->media_hdr.channel == attrbs->channel &&
1000+
rec->media_hdr.rank == attrbs->rank &&
1001+
rec->bank_group == attrbs->bank_group &&
1002+
rec->bank == attrbs->bank &&
1003+
(!(validity_flags & CXL_DER_VALID_NIBBLE) ||
1004+
get_unaligned_le24(rec->nibble_mask) ==
1005+
attrbs->nibble_mask))
1006+
return rec;
1007+
break;
1008+
case CXL_RANK_SPARING:
1009+
if (rec->media_hdr.channel == attrbs->channel &&
1010+
rec->media_hdr.rank == attrbs->rank &&
1011+
(!(validity_flags & CXL_DER_VALID_NIBBLE) ||
1012+
get_unaligned_le24(rec->nibble_mask) ==
1013+
attrbs->nibble_mask))
1014+
return rec;
1015+
break;
1016+
default:
1017+
return NULL;
1018+
}
1019+
1020+
return NULL;
1021+
}
1022+
1023+
#define CXL_MAX_STORAGE_DAYS 10
1024+
#define CXL_MAX_STORAGE_TIME_SECS (CXL_MAX_STORAGE_DAYS * 24 * 60 * 60)
1025+
1026+
static void cxl_del_expired_gmedia_recs(struct xarray *rec_xarray,
1027+
struct cxl_event_gen_media *cur_rec)
1028+
{
1029+
u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp);
1030+
struct cxl_event_gen_media *rec;
1031+
unsigned long index;
1032+
u64 delta_ts_secs;
1033+
1034+
xa_for_each(rec_xarray, index, rec) {
1035+
delta_ts_secs = (cur_ts -
1036+
le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL;
1037+
if (delta_ts_secs >= CXL_MAX_STORAGE_TIME_SECS) {
1038+
xa_erase(rec_xarray, index);
1039+
kfree(rec);
1040+
}
1041+
}
1042+
}
1043+
1044+
static void cxl_del_expired_dram_recs(struct xarray *rec_xarray,
1045+
struct cxl_event_dram *cur_rec)
1046+
{
1047+
u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp);
1048+
struct cxl_event_dram *rec;
1049+
unsigned long index;
1050+
u64 delta_secs;
1051+
1052+
xa_for_each(rec_xarray, index, rec) {
1053+
delta_secs = (cur_ts -
1054+
le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL;
1055+
if (delta_secs >= CXL_MAX_STORAGE_TIME_SECS) {
1056+
xa_erase(rec_xarray, index);
1057+
kfree(rec);
1058+
}
1059+
}
1060+
}
1061+
1062+
#define CXL_MAX_REC_STORAGE_COUNT 200
1063+
1064+
static void cxl_del_overflow_old_recs(struct xarray *rec_xarray)
1065+
{
1066+
void *err_rec;
1067+
unsigned long index, count = 0;
1068+
1069+
xa_for_each(rec_xarray, index, err_rec)
1070+
count++;
1071+
1072+
if (count <= CXL_MAX_REC_STORAGE_COUNT)
1073+
return;
1074+
1075+
count -= CXL_MAX_REC_STORAGE_COUNT;
1076+
xa_for_each(rec_xarray, index, err_rec) {
1077+
xa_erase(rec_xarray, index);
1078+
kfree(err_rec);
1079+
count--;
1080+
if (!count)
1081+
break;
1082+
}
1083+
}
1084+
1085+
int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt)
1086+
{
1087+
struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
1088+
struct cxl_event_gen_media *rec;
1089+
void *old_rec;
1090+
1091+
if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
1092+
return 0;
1093+
1094+
rec = kmemdup(&evt->gen_media, sizeof(*rec), GFP_KERNEL);
1095+
if (!rec)
1096+
return -ENOMEM;
1097+
1098+
old_rec = xa_store(&array_rec->rec_gen_media,
1099+
le64_to_cpu(rec->media_hdr.phys_addr), rec,
1100+
GFP_KERNEL);
1101+
if (xa_is_err(old_rec))
1102+
return xa_err(old_rec);
1103+
1104+
kfree(old_rec);
1105+
1106+
cxl_del_expired_gmedia_recs(&array_rec->rec_gen_media, rec);
1107+
cxl_del_overflow_old_recs(&array_rec->rec_gen_media);
1108+
1109+
return 0;
1110+
}
1111+
EXPORT_SYMBOL_NS_GPL(cxl_store_rec_gen_media, "CXL");
1112+
1113+
int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt)
1114+
{
1115+
struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
1116+
struct cxl_event_dram *rec;
1117+
void *old_rec;
1118+
1119+
if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
1120+
return 0;
1121+
1122+
rec = kmemdup(&evt->dram, sizeof(*rec), GFP_KERNEL);
1123+
if (!rec)
1124+
return -ENOMEM;
1125+
1126+
old_rec = xa_store(&array_rec->rec_dram,
1127+
le64_to_cpu(rec->media_hdr.phys_addr), rec,
1128+
GFP_KERNEL);
1129+
if (xa_is_err(old_rec))
1130+
return xa_err(old_rec);
1131+
1132+
kfree(old_rec);
1133+
1134+
cxl_del_expired_dram_recs(&array_rec->rec_dram, rec);
1135+
cxl_del_overflow_old_recs(&array_rec->rec_dram);
1136+
1137+
return 0;
1138+
}
1139+
EXPORT_SYMBOL_NS_GPL(cxl_store_rec_dram, "CXL");
1140+
8651141
int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
8661142
{
8671143
struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES];
8681144
int num_ras_features = 0;
1145+
u8 repair_inst = 0;
8691146
int rc;
8701147

8711148
if (IS_ENABLED(CONFIG_CXL_EDAC_SCRUB)) {
@@ -886,6 +1163,20 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
8861163
num_ras_features++;
8871164
}
8881165

1166+
if (IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR)) {
1167+
if (repair_inst) {
1168+
struct cxl_mem_err_rec *array_rec =
1169+
devm_kzalloc(&cxlmd->dev, sizeof(*array_rec),
1170+
GFP_KERNEL);
1171+
if (!array_rec)
1172+
return -ENOMEM;
1173+
1174+
xa_init(&array_rec->rec_gen_media);
1175+
xa_init(&array_rec->rec_dram);
1176+
cxlmd->err_rec_array = array_rec;
1177+
}
1178+
}
1179+
8891180
if (!num_ras_features)
8901181
return -EINVAL;
8911182

@@ -923,3 +1214,23 @@ int devm_cxl_region_edac_register(struct cxl_region *cxlr)
9231214
num_ras_features, ras_features);
9241215
}
9251216
EXPORT_SYMBOL_NS_GPL(devm_cxl_region_edac_register, "CXL");
1217+
1218+
void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd)
1219+
{
1220+
struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
1221+
struct cxl_event_gen_media *rec_gen_media;
1222+
struct cxl_event_dram *rec_dram;
1223+
unsigned long index;
1224+
1225+
if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
1226+
return;
1227+
1228+
xa_for_each(&array_rec->rec_dram, index, rec_dram)
1229+
kfree(rec_dram);
1230+
xa_destroy(&array_rec->rec_dram);
1231+
1232+
xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media)
1233+
kfree(rec_gen_media);
1234+
xa_destroy(&array_rec->rec_gen_media);
1235+
}
1236+
EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_release, "CXL");

drivers/cxl/core/mbox.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -922,12 +922,19 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
922922
hpa_alias = hpa - cache_size;
923923
}
924924

925-
if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
925+
if (event_type == CXL_CPER_EVENT_GEN_MEDIA) {
926+
if (cxl_store_rec_gen_media((struct cxl_memdev *)cxlmd, evt))
927+
dev_dbg(&cxlmd->dev, "CXL store rec_gen_media failed\n");
928+
926929
trace_cxl_general_media(cxlmd, type, cxlr, hpa,
927930
hpa_alias, &evt->gen_media);
928-
else if (event_type == CXL_CPER_EVENT_DRAM)
931+
} else if (event_type == CXL_CPER_EVENT_DRAM) {
932+
if (cxl_store_rec_dram((struct cxl_memdev *)cxlmd, evt))
933+
dev_dbg(&cxlmd->dev, "CXL store rec_dram failed\n");
934+
929935
trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias,
930936
&evt->dram);
937+
}
931938
}
932939
}
933940
EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL");

0 commit comments

Comments
 (0)