Skip to content

Commit d9d1f5e

Browse files
kwan-intc authored and dledford committed
IB/hfi1: Drop stale TID RDMA packets that cause TIDErr
In a congested fabric with adaptive routing enabled, traces show that packets could be delivered out of order. A stale TID RDMA data packet could lead to TidErr if the TID entries have been released by duplicate data packets generated from retries, and subsequently erroneously force the qp into error state in the current implementation.

Since the payload has already been dropped by hardware, the packet can be simply dropped and it is no longer necessary to put the qp into error state.

Fixes: 9905bf0 ("IB/hfi1: Add functions to receive TID RDMA READ response")
Cc: <[email protected]>
Reviewed-by: Mike Marciniszyn <[email protected]>
Signed-off-by: Kaike Wan <[email protected]>
Signed-off-by: Dennis Dalessandro <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Doug Ledford <[email protected]>
1 parent 90fdae6 commit d9d1f5e

File tree

1 file changed

+3
-44
lines changed

1 file changed

+3
-44
lines changed

drivers/infiniband/hw/hfi1/tid_rdma.c

Lines changed: 3 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2574,18 +2574,9 @@ void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
25742574
hfi1_kern_clear_hw_flow(priv->rcd, qp);
25752575
}
25762576

2577-
static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
2578-
struct hfi1_packet *packet, u8 rcv_type,
2579-
u8 opcode)
2577+
static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
25802578
{
25812579
struct rvt_qp *qp = packet->qp;
2582-
struct hfi1_qp_priv *qpriv = qp->priv;
2583-
u32 ipsn;
2584-
struct ib_other_headers *ohdr = packet->ohdr;
2585-
struct rvt_ack_entry *e;
2586-
struct tid_rdma_request *req;
2587-
struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2588-
u32 i;
25892580

25902581
if (rcv_type >= RHF_RCV_TYPE_IB)
25912582
goto done;
@@ -2602,41 +2593,9 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
26022593
if (rcv_type == RHF_RCV_TYPE_EAGER) {
26032594
hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
26042595
hfi1_schedule_send(qp);
2605-
goto done_unlock;
26062596
}
26072597

2608-
/*
2609-
* For TID READ response, error out QP after freeing the tid
2610-
* resources.
2611-
*/
2612-
if (opcode == TID_OP(READ_RESP)) {
2613-
ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2614-
if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
2615-
cmp_psn(ipsn, qp->s_psn) < 0) {
2616-
hfi1_kern_read_tid_flow_free(qp);
2617-
spin_unlock(&qp->s_lock);
2618-
rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2619-
goto done;
2620-
}
2621-
goto done_unlock;
2622-
}
2623-
2624-
/*
2625-
* Error out the qp for TID RDMA WRITE
2626-
*/
2627-
hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
2628-
for (i = 0; i < rvt_max_atomic(rdi); i++) {
2629-
e = &qp->s_ack_queue[i];
2630-
if (e->opcode == TID_OP(WRITE_REQ)) {
2631-
req = ack_to_tid_req(e);
2632-
hfi1_kern_exp_rcv_clear_all(req);
2633-
}
2634-
}
2635-
spin_unlock(&qp->s_lock);
2636-
rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
2637-
goto done;
2638-
2639-
done_unlock:
2598+
/* Since no payload is delivered, just drop the packet */
26402599
spin_unlock(&qp->s_lock);
26412600
done:
26422601
return true;
@@ -2925,7 +2884,7 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
29252884
if (lnh == HFI1_LRH_GRH)
29262885
goto r_unlock;
29272886

2928-
if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
2887+
if (tid_rdma_tid_err(packet, rcv_type))
29292888
goto r_unlock;
29302889
}
29312890

0 commit comments

Comments
 (0)