Skip to content

Commit d58c183

Browse files
kwan-intc authored and dledford committed
IB/hfi1: Drop stale TID RDMA packets
In a congested fabric with adaptive routing enabled, traces show that the sender could receive stale TID RDMA NAK packets that contain newer KDETH PSNs and older Verbs PSNs. If not dropped, these packets could cause the incorrect rewinding of the software flows and the incorrect completion of TID RDMA WRITE requests, eventually leading to memory corruption and a kernel crash. The current code drops stale TID RDMA ACK/NAK packets solely based on KDETH PSNs, which may lead to erroneous processing. This patch fixes the issue by also checking the Verbs PSN. Additional checks are added before rewinding the TID RDMA WRITE DATA packets. Fixes: 9e93e96 ("IB/hfi1: Add a function to receive TID RDMA ACK packet") Cc: <[email protected]> Reviewed-by: Mike Marciniszyn <[email protected]> Signed-off-by: Kaike Wan <[email protected]> Signed-off-by: Dennis Dalessandro <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Doug Ledford <[email protected]>
1 parent 9b44007 commit d58c183

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

drivers/infiniband/hw/hfi1/tid_rdma.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4509,7 +4509,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
45094509
struct rvt_swqe *wqe;
45104510
struct tid_rdma_request *req;
45114511
struct tid_rdma_flow *flow;
4512-
u32 aeth, psn, req_psn, ack_psn, resync_psn, ack_kpsn;
4512+
u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn;
45134513
unsigned long flags;
45144514
u16 fidx;
45154515

@@ -4538,6 +4538,9 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
45384538
ack_kpsn--;
45394539
}
45404540

4541+
if (unlikely(qp->s_acked == qp->s_tail))
4542+
goto ack_op_err;
4543+
45414544
wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
45424545

45434546
if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
@@ -4550,7 +4553,8 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
45504553
trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
45514554

45524555
/* Drop stale ACK/NAK */
4553-
if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
4556+
if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 ||
4557+
cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0)
45544558
goto ack_op_err;
45554559

45564560
while (cmp_psn(ack_kpsn,
@@ -4712,7 +4716,12 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
47124716
switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
47134717
IB_AETH_CREDIT_MASK) {
47144718
case 0: /* PSN sequence error */
4719+
if (!req->flows)
4720+
break;
47154721
flow = &req->flows[req->acked_tail];
4722+
flpsn = full_flow_psn(flow, flow->flow_state.lpsn);
4723+
if (cmp_psn(psn, flpsn) > 0)
4724+
break;
47164725
trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
47174726
flow);
47184727
req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));

0 commit comments

Comments
 (0)