Skip to content

Commit 630ba00

Browse files
shijujose4 authored and davem330 committed
net: hns3: add handling of RDMA RAS errors
This patch handles the RDMA RAS errors. 1. Enable the RAS interrupt, print detailed error info and clear the error status. 2. Do a CORE reset to recover when these non-fatal errors happen. Signed-off-by: Xiaofei Tan <[email protected]> Signed-off-by: Shiju Jose <[email protected]> Signed-off-by: Salil Mehta <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent c352917 commit 630ba00

File tree

3 files changed

+199
-1
lines changed

3 files changed

+199
-1
lines changed

drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@ enum hclge_opcode_type {
229229
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
230230
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
231231
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
232+
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
233+
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
234+
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
232235
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
233236
HCLGE_IGU_COMMON_INT_EN = 0x1806,
234237
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,

drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c

Lines changed: 184 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
337337
{ /* sentinel */ }
338338
};
339339

340+
/*
 * ROCEE QMM overflow error descriptions.
 *
 * .int_msk holds an error-type code rather than a bit to test:
 * hclge_log_rocee_ovf_error() compares it for equality against the status
 * word masked with HCLGE_ROCEE_OVF_ERR_TYPE_MASK.  The table is terminated
 * by an empty entry whose .msg is NULL.
 */
static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
	/* sgid / smac */
	{ .int_msk = 0x00, .msg = "rocee qmm ovf: sgid invalid err" },
	{ .int_msk = 0x04, .msg = "rocee qmm ovf: sgid ovf err" },
	{ .int_msk = 0x08, .msg = "rocee qmm ovf: smac invalid err" },
	{ .int_msk = 0x0C, .msg = "rocee qmm ovf: smac ovf err" },
	/* cqc */
	{ .int_msk = 0x10, .msg = "rocee qmm ovf: cqc invalid err" },
	{ .int_msk = 0x11, .msg = "rocee qmm ovf: cqc ovf err" },
	{ .int_msk = 0x12, .msg = "rocee qmm ovf: cqc hopnum err" },
	{ .int_msk = 0x13, .msg = "rocee qmm ovf: cqc ba0 err" },
	/* srqc */
	{ .int_msk = 0x14, .msg = "rocee qmm ovf: srqc invalid err" },
	{ .int_msk = 0x15, .msg = "rocee qmm ovf: srqc ovf err" },
	{ .int_msk = 0x16, .msg = "rocee qmm ovf: srqc hopnum err" },
	{ .int_msk = 0x17, .msg = "rocee qmm ovf: srqc ba0 err" },
	/* mpt */
	{ .int_msk = 0x18, .msg = "rocee qmm ovf: mpt invalid err" },
	{ .int_msk = 0x19, .msg = "rocee qmm ovf: mpt ovf err" },
	{ .int_msk = 0x1A, .msg = "rocee qmm ovf: mpt hopnum err" },
	{ .int_msk = 0x1B, .msg = "rocee qmm ovf: mpt ba0 err" },
	/* qpc */
	{ .int_msk = 0x1C, .msg = "rocee qmm ovf: qpc invalid err" },
	{ .int_msk = 0x1D, .msg = "rocee qmm ovf: qpc ovf err" },
	{ .int_msk = 0x1E, .msg = "rocee qmm ovf: qpc hopnum err" },
	{ .int_msk = 0x1F, .msg = "rocee qmm ovf: qpc ba0 err" },
	{ /* sentinel */ }
};
363+
340364
static void hclge_log_error(struct device *dev, char *reg,
341365
const struct hclge_hw_error *err,
342366
u32 err_sts)
@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
10231047
return ret;
10241048
}
10251049

1050+
static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev)
1051+
{
1052+
struct device *dev = &hdev->pdev->dev;
1053+
struct hclge_desc desc[2];
1054+
int ret;
1055+
1056+
/* read overflow error status */
1057+
ret = hclge_cmd_query_error(hdev, &desc[0],
1058+
HCLGE_ROCEE_PF_RAS_INT_CMD,
1059+
0, 0, 0);
1060+
if (ret) {
1061+
dev_err(dev, "failed(%d) to query ROCEE OVF error sts\n", ret);
1062+
return ret;
1063+
}
1064+
1065+
/* log overflow error */
1066+
if (le32_to_cpu(desc[0].data[0]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
1067+
const struct hclge_hw_error *err;
1068+
u32 err_sts;
1069+
1070+
err = &hclge_rocee_qmm_ovf_err_int[0];
1071+
err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK &
1072+
le32_to_cpu(desc[0].data[0]);
1073+
while (err->msg) {
1074+
if (err->int_msk == err_sts) {
1075+
dev_warn(dev, "%s [error status=0x%x] found\n",
1076+
err->msg,
1077+
le32_to_cpu(desc[0].data[0]));
1078+
break;
1079+
}
1080+
err++;
1081+
}
1082+
}
1083+
1084+
if (le32_to_cpu(desc[0].data[1]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
1085+
dev_warn(dev, "ROCEE TSP OVF [error status=0x%x] found\n",
1086+
le32_to_cpu(desc[0].data[1]));
1087+
}
1088+
1089+
if (le32_to_cpu(desc[0].data[2]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
1090+
dev_warn(dev, "ROCEE SCC OVF [error status=0x%x] found\n",
1091+
le32_to_cpu(desc[0].data[2]));
1092+
}
1093+
1094+
return 0;
1095+
}
1096+
1097+
static int hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev)
1098+
{
1099+
enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET;
1100+
struct hnae3_ae_dev *ae_dev = hdev->ae_dev;
1101+
struct device *dev = &hdev->pdev->dev;
1102+
struct hclge_desc desc[2];
1103+
unsigned int status;
1104+
int ret;
1105+
1106+
/* read RAS error interrupt status */
1107+
ret = hclge_cmd_query_error(hdev, &desc[0],
1108+
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT,
1109+
0, 0, 0);
1110+
if (ret) {
1111+
dev_err(dev, "failed(%d) to query ROCEE RAS INT SRC\n", ret);
1112+
/* reset everything for now */
1113+
HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
1114+
return ret;
1115+
}
1116+
1117+
status = le32_to_cpu(desc[0].data[0]);
1118+
1119+
if (status & HCLGE_ROCEE_RERR_INT_MASK)
1120+
dev_warn(dev, "ROCEE RAS AXI rresp error\n");
1121+
1122+
if (status & HCLGE_ROCEE_BERR_INT_MASK)
1123+
dev_warn(dev, "ROCEE RAS AXI bresp error\n");
1124+
1125+
if (status & HCLGE_ROCEE_ECC_INT_MASK) {
1126+
dev_warn(dev, "ROCEE RAS 2bit ECC error\n");
1127+
reset_type = HNAE3_GLOBAL_RESET;
1128+
}
1129+
1130+
if (status & HCLGE_ROCEE_OVF_INT_MASK) {
1131+
ret = hclge_log_rocee_ovf_error(hdev);
1132+
if (ret) {
1133+
dev_err(dev, "failed(%d) to process ovf error\n", ret);
1134+
/* reset everything for now */
1135+
HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
1136+
return ret;
1137+
}
1138+
}
1139+
1140+
/* clear error status */
1141+
hclge_cmd_reuse_desc(&desc[0], false);
1142+
ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
1143+
if (ret) {
1144+
dev_err(dev, "failed(%d) to clear ROCEE RAS error\n", ret);
1145+
/* reset everything for now */
1146+
reset_type = HNAE3_GLOBAL_RESET;
1147+
}
1148+
1149+
HCLGE_SET_DEFAULT_RESET_REQUEST(reset_type);
1150+
1151+
return ret;
1152+
}
1153+
1154+
/*
 * hclge_config_rocee_ras_interrupt - enable or disable ROCEE RAS interrupts
 * @hdev: device to configure
 * @en: true to write the enable bits, false to leave them as set up by
 *      hclge_cmd_setup_basic_desc()
 *
 * Nothing is done (and 0 is returned) when the PCI revision is below 0x21
 * or the device does not support RoCE.  When enabling, the current ROCEE
 * RAS status is logged and cleared first; the result of that step is not
 * checked here.
 *
 * Return: 0 on success or when skipped, otherwise the error from
 * hclge_cmd_send().
 */
static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en)
{
	struct device *dev = &hdev->pdev->dev;
	struct hclge_desc desc;
	int ret;

	if (hdev->pdev->revision < 0x21)
		return 0;

	if (!hnae3_dev_roce_supported(hdev))
		return 0;

	hclge_cmd_setup_basic_desc(&desc, HCLGE_CONFIG_ROCEE_RAS_INT_EN, false);
	if (en) {
		hclge_log_and_clear_rocee_ras_error(hdev);

		/* enable ROCEE hw error interrupts */
		desc.data[0] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN);
		desc.data[1] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN);
	}

	/* the mask words are written for both enable and disable */
	desc.data[2] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN_MASK);
	desc.data[3] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN_MASK);

	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
	if (!ret)
		return 0;

	dev_err(dev, "failed(%d) to config ROCEE RAS interrupt\n", ret);

	return ret;
}
1180+
1181+
static int hclge_handle_rocee_ras_error(struct hnae3_ae_dev *ae_dev)
1182+
{
1183+
struct hclge_dev *hdev = ae_dev->priv;
1184+
1185+
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
1186+
hdev->pdev->revision < 0x21)
1187+
return HNAE3_NONE_RESET;
1188+
1189+
return hclge_log_and_clear_rocee_ras_error(hdev);
1190+
}
1191+
10261192
static const struct hclge_hw_blk hw_blk[] = {
10271193
{
10281194
.msk = BIT(0), .name = "IGU_EGU",
@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
10581224
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
10591225
{
10601226
const struct hclge_hw_blk *module = hw_blk;
1227+
struct device *dev = &hdev->pdev->dev;
10611228
int ret = 0;
10621229

10631230
while (module->name) {
@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
10691236
module++;
10701237
}
10711238

1239+
ret = hclge_config_rocee_ras_interrupt(hdev, state);
1240+
if (ret)
1241+
dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret);
1242+
10721243
return ret;
10731244
}
10741245

@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
10861257
"HNS Non-Fatal RAS error(status=0x%x) identified\n",
10871258
status);
10881259
hclge_handle_all_ras_errors(hdev);
1089-
return PCI_ERS_RESULT_NEED_RESET;
1260+
} else {
1261+
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
1262+
hdev->pdev->revision < 0x21)
1263+
return PCI_ERS_RESULT_RECOVERED;
1264+
}
1265+
1266+
if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
1267+
dev_warn(dev, "ROCEE uncorrected RAS error identified\n");
1268+
hclge_handle_rocee_ras_error(ae_dev);
10901269
}
10911270

1271+
if (status & HCLGE_RAS_REG_NFE_MASK ||
1272+
status & HCLGE_RAS_REG_ROCEE_ERR_MASK)
1273+
return PCI_ERS_RESULT_NEED_RESET;
1274+
10921275
return PCI_ERS_RESULT_RECOVERED;
10931276
}
10941277

drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
1010
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
11+
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
1112

1213
#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
1314
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
@@ -83,6 +84,17 @@
8384
#define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0)
8485
#define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0)
8586

87+
/*
 * HCLGE_CONFIG_ROCEE_RAS_INT_EN payload: the *_INT_EN values are written to
 * data[0]/data[1] when enabling, the *_INT_EN_MASK values to data[2]/data[3].
 */
#define HCLGE_ROCEE_RAS_NFE_INT_EN		0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN		0x1
#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK		0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK		0x1

/*
 * Error source bits tested in data[0] of the HCLGE_QUERY_CLEAR_ROCEE_RAS_INT
 * reply: AXI rresp, AXI bresp, 2bit ECC and overflow.
 */
#define HCLGE_ROCEE_RERR_INT_MASK		BIT(0)
#define HCLGE_ROCEE_BERR_INT_MASK		BIT(1)
#define HCLGE_ROCEE_ECC_INT_MASK		BIT(2)
#define HCLGE_ROCEE_OVF_INT_MASK		BIT(3)

/*
 * HCLGE_ROCEE_PF_RAS_INT_CMD reply: the INT mask flags an overflow in
 * data[0..2]; the TYPE mask extracts the QMM overflow type from data[0].
 */
#define HCLGE_ROCEE_OVF_ERR_INT_MASK		0x10000
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK		0x3F
97+
8698
enum hclge_err_int_type {
8799
HCLGE_ERR_INT_MSIX = 0,
88100
HCLGE_ERR_INT_RAS_CE = 1,

0 commit comments

Comments
 (0)