@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
337
337
{ /* sentinel */ }
338
338
};
339
339
340
+ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int [] = {
341
+ { .int_msk = 0 , .msg = "rocee qmm ovf: sgid invalid err" },
342
+ { .int_msk = 0x4 , .msg = "rocee qmm ovf: sgid ovf err" },
343
+ { .int_msk = 0x8 , .msg = "rocee qmm ovf: smac invalid err" },
344
+ { .int_msk = 0xC , .msg = "rocee qmm ovf: smac ovf err" },
345
+ { .int_msk = 0x10 , .msg = "rocee qmm ovf: cqc invalid err" },
346
+ { .int_msk = 0x11 , .msg = "rocee qmm ovf: cqc ovf err" },
347
+ { .int_msk = 0x12 , .msg = "rocee qmm ovf: cqc hopnum err" },
348
+ { .int_msk = 0x13 , .msg = "rocee qmm ovf: cqc ba0 err" },
349
+ { .int_msk = 0x14 , .msg = "rocee qmm ovf: srqc invalid err" },
350
+ { .int_msk = 0x15 , .msg = "rocee qmm ovf: srqc ovf err" },
351
+ { .int_msk = 0x16 , .msg = "rocee qmm ovf: srqc hopnum err" },
352
+ { .int_msk = 0x17 , .msg = "rocee qmm ovf: srqc ba0 err" },
353
+ { .int_msk = 0x18 , .msg = "rocee qmm ovf: mpt invalid err" },
354
+ { .int_msk = 0x19 , .msg = "rocee qmm ovf: mpt ovf err" },
355
+ { .int_msk = 0x1A , .msg = "rocee qmm ovf: mpt hopnum err" },
356
+ { .int_msk = 0x1B , .msg = "rocee qmm ovf: mpt ba0 err" },
357
+ { .int_msk = 0x1C , .msg = "rocee qmm ovf: qpc invalid err" },
358
+ { .int_msk = 0x1D , .msg = "rocee qmm ovf: qpc ovf err" },
359
+ { .int_msk = 0x1E , .msg = "rocee qmm ovf: qpc hopnum err" },
360
+ { .int_msk = 0x1F , .msg = "rocee qmm ovf: qpc ba0 err" },
361
+ { /* sentinel */ }
362
+ };
363
+
340
364
static void hclge_log_error (struct device * dev , char * reg ,
341
365
const struct hclge_hw_error * err ,
342
366
u32 err_sts )
@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
1023
1047
return ret ;
1024
1048
}
1025
1049
1050
+ static int hclge_log_rocee_ovf_error (struct hclge_dev * hdev )
1051
+ {
1052
+ struct device * dev = & hdev -> pdev -> dev ;
1053
+ struct hclge_desc desc [2 ];
1054
+ int ret ;
1055
+
1056
+ /* read overflow error status */
1057
+ ret = hclge_cmd_query_error (hdev , & desc [0 ],
1058
+ HCLGE_ROCEE_PF_RAS_INT_CMD ,
1059
+ 0 , 0 , 0 );
1060
+ if (ret ) {
1061
+ dev_err (dev , "failed(%d) to query ROCEE OVF error sts\n" , ret );
1062
+ return ret ;
1063
+ }
1064
+
1065
+ /* log overflow error */
1066
+ if (le32_to_cpu (desc [0 ].data [0 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1067
+ const struct hclge_hw_error * err ;
1068
+ u32 err_sts ;
1069
+
1070
+ err = & hclge_rocee_qmm_ovf_err_int [0 ];
1071
+ err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK &
1072
+ le32_to_cpu (desc [0 ].data [0 ]);
1073
+ while (err -> msg ) {
1074
+ if (err -> int_msk == err_sts ) {
1075
+ dev_warn (dev , "%s [error status=0x%x] found\n" ,
1076
+ err -> msg ,
1077
+ le32_to_cpu (desc [0 ].data [0 ]));
1078
+ break ;
1079
+ }
1080
+ err ++ ;
1081
+ }
1082
+ }
1083
+
1084
+ if (le32_to_cpu (desc [0 ].data [1 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1085
+ dev_warn (dev , "ROCEE TSP OVF [error status=0x%x] found\n" ,
1086
+ le32_to_cpu (desc [0 ].data [1 ]));
1087
+ }
1088
+
1089
+ if (le32_to_cpu (desc [0 ].data [2 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1090
+ dev_warn (dev , "ROCEE SCC OVF [error status=0x%x] found\n" ,
1091
+ le32_to_cpu (desc [0 ].data [2 ]));
1092
+ }
1093
+
1094
+ return 0 ;
1095
+ }
1096
+
1097
+ static int hclge_log_and_clear_rocee_ras_error (struct hclge_dev * hdev )
1098
+ {
1099
+ enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET ;
1100
+ struct hnae3_ae_dev * ae_dev = hdev -> ae_dev ;
1101
+ struct device * dev = & hdev -> pdev -> dev ;
1102
+ struct hclge_desc desc [2 ];
1103
+ unsigned int status ;
1104
+ int ret ;
1105
+
1106
+ /* read RAS error interrupt status */
1107
+ ret = hclge_cmd_query_error (hdev , & desc [0 ],
1108
+ HCLGE_QUERY_CLEAR_ROCEE_RAS_INT ,
1109
+ 0 , 0 , 0 );
1110
+ if (ret ) {
1111
+ dev_err (dev , "failed(%d) to query ROCEE RAS INT SRC\n" , ret );
1112
+ /* reset everything for now */
1113
+ HCLGE_SET_DEFAULT_RESET_REQUEST (HNAE3_GLOBAL_RESET );
1114
+ return ret ;
1115
+ }
1116
+
1117
+ status = le32_to_cpu (desc [0 ].data [0 ]);
1118
+
1119
+ if (status & HCLGE_ROCEE_RERR_INT_MASK )
1120
+ dev_warn (dev , "ROCEE RAS AXI rresp error\n" );
1121
+
1122
+ if (status & HCLGE_ROCEE_BERR_INT_MASK )
1123
+ dev_warn (dev , "ROCEE RAS AXI bresp error\n" );
1124
+
1125
+ if (status & HCLGE_ROCEE_ECC_INT_MASK ) {
1126
+ dev_warn (dev , "ROCEE RAS 2bit ECC error\n" );
1127
+ reset_type = HNAE3_GLOBAL_RESET ;
1128
+ }
1129
+
1130
+ if (status & HCLGE_ROCEE_OVF_INT_MASK ) {
1131
+ ret = hclge_log_rocee_ovf_error (hdev );
1132
+ if (ret ) {
1133
+ dev_err (dev , "failed(%d) to process ovf error\n" , ret );
1134
+ /* reset everything for now */
1135
+ HCLGE_SET_DEFAULT_RESET_REQUEST (HNAE3_GLOBAL_RESET );
1136
+ return ret ;
1137
+ }
1138
+ }
1139
+
1140
+ /* clear error status */
1141
+ hclge_cmd_reuse_desc (& desc [0 ], false);
1142
+ ret = hclge_cmd_send (& hdev -> hw , & desc [0 ], 1 );
1143
+ if (ret ) {
1144
+ dev_err (dev , "failed(%d) to clear ROCEE RAS error\n" , ret );
1145
+ /* reset everything for now */
1146
+ reset_type = HNAE3_GLOBAL_RESET ;
1147
+ }
1148
+
1149
+ HCLGE_SET_DEFAULT_RESET_REQUEST (reset_type );
1150
+
1151
+ return ret ;
1152
+ }
1153
+
1154
+ static int hclge_config_rocee_ras_interrupt (struct hclge_dev * hdev , bool en )
1155
+ {
1156
+ struct device * dev = & hdev -> pdev -> dev ;
1157
+ struct hclge_desc desc ;
1158
+ int ret ;
1159
+
1160
+ if (hdev -> pdev -> revision < 0x21 || !hnae3_dev_roce_supported (hdev ))
1161
+ return 0 ;
1162
+
1163
+ hclge_cmd_setup_basic_desc (& desc , HCLGE_CONFIG_ROCEE_RAS_INT_EN , false);
1164
+ if (en ) {
1165
+ /* enable ROCEE hw error interrupts */
1166
+ desc .data [0 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_NFE_INT_EN );
1167
+ desc .data [1 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_CE_INT_EN );
1168
+
1169
+ hclge_log_and_clear_rocee_ras_error (hdev );
1170
+ }
1171
+ desc .data [2 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_NFE_INT_EN_MASK );
1172
+ desc .data [3 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_CE_INT_EN_MASK );
1173
+
1174
+ ret = hclge_cmd_send (& hdev -> hw , & desc , 1 );
1175
+ if (ret )
1176
+ dev_err (dev , "failed(%d) to config ROCEE RAS interrupt\n" , ret );
1177
+
1178
+ return ret ;
1179
+ }
1180
+
1181
+ static int hclge_handle_rocee_ras_error (struct hnae3_ae_dev * ae_dev )
1182
+ {
1183
+ struct hclge_dev * hdev = ae_dev -> priv ;
1184
+
1185
+ if (test_bit (HCLGE_STATE_RST_HANDLING , & hdev -> state ) ||
1186
+ hdev -> pdev -> revision < 0x21 )
1187
+ return HNAE3_NONE_RESET ;
1188
+
1189
+ return hclge_log_and_clear_rocee_ras_error (hdev );
1190
+ }
1191
+
1026
1192
static const struct hclge_hw_blk hw_blk [] = {
1027
1193
{
1028
1194
.msk = BIT (0 ), .name = "IGU_EGU" ,
@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
1058
1224
int hclge_hw_error_set_state (struct hclge_dev * hdev , bool state )
1059
1225
{
1060
1226
const struct hclge_hw_blk * module = hw_blk ;
1227
+ struct device * dev = & hdev -> pdev -> dev ;
1061
1228
int ret = 0 ;
1062
1229
1063
1230
while (module -> name ) {
@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
1069
1236
module ++ ;
1070
1237
}
1071
1238
1239
+ ret = hclge_config_rocee_ras_interrupt (hdev , state );
1240
+ if (ret )
1241
+ dev_err (dev , "fail(%d) to configure ROCEE err int\n" , ret );
1242
+
1072
1243
return ret ;
1073
1244
}
1074
1245
@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
1086
1257
"HNS Non-Fatal RAS error(status=0x%x) identified\n" ,
1087
1258
status );
1088
1259
hclge_handle_all_ras_errors (hdev );
1089
- return PCI_ERS_RESULT_NEED_RESET ;
1260
+ } else {
1261
+ if (test_bit (HCLGE_STATE_RST_HANDLING , & hdev -> state ) ||
1262
+ hdev -> pdev -> revision < 0x21 )
1263
+ return PCI_ERS_RESULT_RECOVERED ;
1264
+ }
1265
+
1266
+ if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK ) {
1267
+ dev_warn (dev , "ROCEE uncorrected RAS error identified\n" );
1268
+ hclge_handle_rocee_ras_error (ae_dev );
1090
1269
}
1091
1270
1271
+ if (status & HCLGE_RAS_REG_NFE_MASK ||
1272
+ status & HCLGE_RAS_REG_ROCEE_ERR_MASK )
1273
+ return PCI_ERS_RESULT_NEED_RESET ;
1274
+
1092
1275
return PCI_ERS_RESULT_RECOVERED ;
1093
1276
}
1094
1277
0 commit comments