Skip to content

Commit 0492458

Browse files
sheftyrleon
authored and committed
IB/cm: Explicitly mark if a response MAD is a retransmission
In several situations the CM may send a reply to a received MAD without the reply being directly linked with a cm_id. For example, it may send a REJ in response to a REQ which does not match a listener. Or, it may send a DREP in response to a DREQ if the cm_id has already been destroyed. This can happen if the original DREP was lost and the DREQ was retried.

When such a response MAD completes, it updates a counter tracking how many MADs were retried. However, not all response MADs issued directly by the CM may be retries. The REJ mentioned in the example above is such a case.

To distinguish between responses which were retries versus those that are not, the send_handler performs the following check: a response is a retry if it is not associated with a cm_id and it is not a REJ message.

Replace this indirect method of checking if a response is a retry with an explicit check. Note that these retries are generated directly by the CM, rather than retried by the MAD layer.

This change will be needed by later changes which would otherwise break the indirect check.

Signed-off-by: Sean Hefty <[email protected]>
Signed-off-by: Or Har-Toov <[email protected]>
Signed-off-by: Vlad Dumitrescu <[email protected]>
Link: https://patch.msgid.link/1ee6e2a68f8de1992b9da23aa1d7e3f9f25e0036.1731495873.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <[email protected]>
1 parent ede132a commit 0492458

File tree

1 file changed

+31
-20
lines changed
  • drivers/infiniband/core

1 file changed

+31
-20
lines changed

drivers/infiniband/core/cm.c

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ MODULE_DESCRIPTION("InfiniBand CM");
3535
MODULE_LICENSE("Dual BSD/GPL");
3636

3737
#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
38+
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
39+
3840
static const char * const ibcm_rej_reason_strs[] = {
3941
[IB_CM_REJ_NO_QP] = "no QP",
4042
[IB_CM_REJ_NO_EEC] = "no EEC",
@@ -358,13 +360,20 @@ static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
358360
ib_free_send_mad(msg);
359361
}
360362

361-
static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
362-
struct ib_mad_recv_wc *mad_recv_wc)
363+
static struct ib_mad_send_buf *
364+
cm_alloc_response_msg_no_ah(struct cm_port *port,
365+
struct ib_mad_recv_wc *mad_recv_wc,
366+
bool direct_retry)
363367
{
364-
return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
365-
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
366-
GFP_ATOMIC,
367-
IB_MGMT_BASE_VERSION);
368+
struct ib_mad_send_buf *m;
369+
370+
m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
371+
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
372+
GFP_ATOMIC, IB_MGMT_BASE_VERSION);
373+
if (!IS_ERR(m))
374+
m->context[0] = direct_retry ? CM_DIRECT_RETRY_CTX : NULL;
375+
376+
return m;
368377
}
369378

370379
static int cm_create_response_msg_ah(struct cm_port *port,
@@ -384,12 +393,13 @@ static int cm_create_response_msg_ah(struct cm_port *port,
384393

385394
static int cm_alloc_response_msg(struct cm_port *port,
386395
struct ib_mad_recv_wc *mad_recv_wc,
396+
bool direct_retry,
387397
struct ib_mad_send_buf **msg)
388398
{
389399
struct ib_mad_send_buf *m;
390400
int ret;
391401

392-
m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
402+
m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry);
393403
if (IS_ERR(m))
394404
return PTR_ERR(m);
395405

@@ -1598,7 +1608,7 @@ static int cm_issue_rej(struct cm_port *port,
15981608
struct cm_rej_msg *rej_msg, *rcv_msg;
15991609
int ret;
16001610

1601-
ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
1611+
ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg);
16021612
if (ret)
16031613
return ret;
16041614

@@ -1951,7 +1961,7 @@ static void cm_dup_req_handler(struct cm_work *work,
19511961
}
19521962
spin_unlock_irq(&cm_id_priv->lock);
19531963

1954-
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
1964+
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
19551965
if (ret)
19561966
return;
19571967

@@ -2444,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
24442454

24452455
atomic_long_inc(
24462456
&work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]);
2447-
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
2457+
ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg);
24482458
if (ret)
24492459
goto deref;
24502460

@@ -2791,7 +2801,7 @@ static int cm_issue_drep(struct cm_port *port,
27912801
struct cm_drep_msg *drep_msg;
27922802
int ret;
27932803

2794-
ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
2804+
ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg);
27952805
if (ret)
27962806
return ret;
27972807

@@ -2856,7 +2866,8 @@ static int cm_dreq_handler(struct cm_work *work)
28562866
case IB_CM_TIMEWAIT:
28572867
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
28582868
[CM_DREQ_COUNTER]);
2859-
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
2869+
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
2870+
true);
28602871
if (IS_ERR(msg))
28612872
goto unlock;
28622873

@@ -3361,7 +3372,8 @@ static int cm_lap_handler(struct cm_work *work)
33613372
case IB_CM_MRA_LAP_SENT:
33623373
atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
33633374
[CM_LAP_COUNTER]);
3364-
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
3375+
msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc,
3376+
true);
33653377
if (IS_ERR(msg))
33663378
goto unlock;
33673379

@@ -3826,7 +3838,7 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
38263838
struct ib_mad_send_wc *mad_send_wc)
38273839
{
38283840
struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
3829-
struct cm_id_private *cm_id_priv = msg->context[0];
3841+
struct cm_id_private *cm_id_priv;
38303842
enum ib_cm_state state =
38313843
(enum ib_cm_state)(unsigned long)msg->context[1];
38323844
struct cm_port *port;
@@ -3836,13 +3848,12 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
38363848
attr_index = be16_to_cpu(((struct ib_mad_hdr *)
38373849
msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
38383850

3839-
/*
3840-
* If the send was in response to a received message (context[0] is not
3841-
* set to a cm_id), and is not a REJ, then it is a send that was
3842-
* manually retried.
3843-
*/
3844-
if (!cm_id_priv && (attr_index != CM_REJ_COUNTER))
3851+
if (msg->context[0] == CM_DIRECT_RETRY_CTX) {
38453852
msg->retries = 1;
3853+
cm_id_priv = NULL;
3854+
} else {
3855+
cm_id_priv = msg->context[0];
3856+
}
38463857

38473858
atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]);
38483859
if (msg->retries)

0 commit comments

Comments
 (0)