Skip to content

Commit 637d078

Browse files
Wei Lin Guay, gerd-rausch
authored and committed
net/rds: Avoid stalled connection due to CM REQ retries
RDS drops a connection and destroys its cm_id once a CM REJ is sent. In a congested fabric, there is a race where a remote node receives a CM REJ after its CM has already retried the CM REQ. In this scenario, the cm_id that sent the CM REQ no longer exists, even though the remote end might respond with a CM REP and then wait for an incoming CM RTU. The RDS connection establishment is then stuck until the connection is destroyed after the CM timeout. As a result, this leads to a very long brownout time. Thus, this patch adds a mechanism to detect a rejected CM REQ and to reject all subsequent CM REQs that are retried by the CM. Orabug: 25521901 Signed-off-by: Wei Lin Guay <[email protected]> Tested-by: Dib Chatterjee <[email protected]> (cherry picked from commit c5c4f1472bc788ddc69af713f975ad92bdefe206 repo https://linux-git.us.oracle.com/UEK/linux-wguay-public) Conflict: net/rds/ib_cm.c Made it checkpatch clean. v1->v2: Added Shannon's recommendations Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Shannon Nelson <[email protected]>
1 parent d4ead45 commit 637d078

File tree

2 files changed

+40
-6
lines changed

2 files changed

+40
-6
lines changed

net/rds/ib.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
#define RDS_IB_CLEAN_CACHE 1
5656

5757
#define RDS_IB_DEFAULT_FREG_PORT_NUM 1
58+
#define RDS_CM_RETRY_SEQ_EN BIT(7)
59+
#define RDS_CM_REQ_SEQ_SZ (RDS_CM_RETRY_SEQ_EN - 1)
5860

5961
extern struct rw_semaphore rds_ib_devices_lock;
6062
extern struct list_head rds_ib_devices;
@@ -92,7 +94,7 @@ struct rds_ib_conn_priv_cmn {
9294
u8 ricpc_protocol_minor;
9395
__be16 ricpc_protocol_minor_mask; /* bitmask */
9496
u8 ricpc_tos;
95-
u8 ricpc_reserved1;
97+
u8 ricpc_cm_seq;
9698
__be16 ricpc_frag_sz;
9799
__be64 ricpc_ack_seq;
98100
__be32 ricpc_credit; /* non-zero enables flow ctl */
@@ -116,7 +118,7 @@ struct rds6_ib_connect_private {
116118
#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
117119
#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
118120
#define dp_tos dp_cmn.ricpc_tos
119-
#define dp_reserved1 dp_cmn.ricpc_reserved1
121+
#define dp_cm_seq dp_cmn.ricpc_cm_seq
120122
#define dp_frag_sz dp_cmn.ricpc_frag_sz
121123
#define dp_ack_seq dp_cmn.ricpc_ack_seq
122124
#define dp_credit dp_cmn.ricpc_credit
@@ -275,13 +277,17 @@ struct rds_ib_connection {
275277
unsigned int i_rx_wait_for_handler;
276278
atomic_t i_worker_has_rx;
277279
atomic_t i_cq_quiesce;
280+
u8 i_req_sequence;
281+
u8 i_prev_seq;
282+
u8 i_last_rej_seq;
278283
};
279284

280285
/* This assumes that atomic_t is at least 32 bits */
281286
#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
282287
#define IB_GET_POST_CREDITS(v) ((v) >> 16)
283288
#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
284289
#define IB_SET_POST_CREDITS(v) ((v) << 16)
290+
#define IB_GET_CM_SEQ_NUM(v) ((v) & RDS_CM_REQ_SEQ_SZ)
285291

286292
struct rds_ib_ipaddr {
287293
struct list_head list;

net/rds/ib_cm.c

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
373373
u32 protocol_version,
374374
u32 max_responder_resources,
375375
u32 max_initiator_depth, u16 frag,
376-
bool isv6)
376+
bool isv6, u8 seq)
377377
{
378378
struct rds_ib_connection *ic = conn->c_transport_data;
379379
struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -403,6 +403,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
403403
cpu_to_be64(rds_ib_piggyb_ack(ic));
404404
dp->ricp_v6.dp_tos = conn->c_tos;
405405
dp->ricp_v6.dp_frag_sz = cpu_to_be16(frag);
406+
dp->ricp_v6.dp_cm_seq = seq;
406407

407408
conn_param->private_data = &dp->ricp_v6;
408409
conn_param->private_data_len = sizeof(dp->ricp_v6);
@@ -419,6 +420,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
419420
cpu_to_be64(rds_ib_piggyb_ack(ic));
420421
dp->ricp_v4.dp_tos = conn->c_tos;
421422
dp->ricp_v4.dp_frag_sz = cpu_to_be16(frag);
423+
dp->ricp_v4.dp_cm_seq = seq;
422424

423425
conn_param->private_data = &dp->ricp_v4;
424426
conn_param->private_data_len = sizeof(dp->ricp_v4);
@@ -995,6 +997,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
995997
u32 version;
996998
int err = 1;
997999
u16 frag;
1000+
u8 cm_req_seq = 0;
1001+
bool cm_seq_check_enable = false;
9981002

9991003
/* Check whether the remote protocol version matches ours. */
10001004
version = rds_ib_protocol_compatible(event, isv6);
@@ -1019,12 +1023,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
10191023
goto out;
10201024
}
10211025
}
1026+
cm_seq_check_enable = dp->ricp_v6.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
1027+
cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v6.dp_cm_seq);
10221028
} else {
10231029
dp_cmn = &dp->ricp_v4.dp_cmn;
10241030
ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
10251031
ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
10261032
saddr6 = &s_mapped_addr;
10271033
daddr6 = &d_mapped_addr;
1034+
cm_seq_check_enable = dp->ricp_v4.dp_cm_seq & RDS_CM_RETRY_SEQ_EN;
1035+
cm_req_seq = IB_GET_CM_SEQ_NUM(dp->ricp_v4.dp_cm_seq);
10281036
}
10291037

10301038
rds_rtd_ptr(RDS_RTD_CM,
@@ -1076,12 +1084,27 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
10761084
* see the comment above rds_queue_reconnect()
10771085
*/
10781086
mutex_lock(&conn->c_cm_lock);
1087+
ic = conn->c_transport_data;
1088+
1089+
if (ic && cm_seq_check_enable) {
1090+
if (cm_req_seq != ic->i_prev_seq) {
1091+
rds_rtd(RDS_RTD_CM_EXT_P,
1092+
"cm_id %p conn %p updating ic->i_prev_seq %d cm_req_seq %d\n",
1093+
cm_id, conn, ic->i_prev_seq, cm_req_seq);
1094+
ic->i_prev_seq = cm_req_seq;
1095+
} else if (cm_req_seq == ic->i_prev_seq && ic->i_last_rej_seq == cm_req_seq) {
1096+
rds_rtd(RDS_RTD_CM_EXT_P,
1097+
"duplicated REQ cm_id %p conn %p reject! ic->i_last_rej_seq %d cm_req_seq %d\n",
1098+
cm_id, conn, ic->i_last_rej_seq, cm_req_seq);
1099+
goto out;
1100+
}
1101+
}
1102+
10791103
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
10801104
/*
10811105
* in both of the cases below, the conn is half setup.
10821106
* we need to make sure the lower layers don't destroy it
10831107
*/
1084-
ic = conn->c_transport_data;
10851108
if (ic && ic->i_cm_id == cm_id)
10861109
destroy = 0;
10871110
if (rds_conn_state(conn) == RDS_CONN_UP) {
@@ -1125,6 +1148,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
11251148
rds_ib_stats_inc(s_ib_connect_raced);
11261149
}
11271150
}
1151+
if (ic && cm_seq_check_enable)
1152+
ic->i_last_rej_seq = cm_req_seq;
11281153
goto out;
11291154
}
11301155

@@ -1172,7 +1197,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
11721197
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
11731198
event->param.conn.responder_resources,
11741199
event->param.conn.initiator_depth,
1175-
frag, isv6);
1200+
frag, isv6, cm_req_seq);
11761201

11771202
/* rdma_accept() calls rdma_reject() internally if it fails */
11781203
err = rdma_accept(cm_id, &conn_param);
@@ -1224,6 +1249,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
12241249
union rds_ib_conn_priv dp;
12251250
u16 frag;
12261251
int ret;
1252+
u8 seq;
12271253

12281254
#ifdef CONFIG_RDS_ACL
12291255

@@ -1265,9 +1291,11 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
12651291
goto out;
12661292
}
12671293
frag = rds_ib_set_frag_size(conn, ib_init_frag_size);
1294+
ic->i_req_sequence = IB_GET_CM_SEQ_NUM(ic->i_req_sequence + 1);
1295+
seq = RDS_CM_RETRY_SEQ_EN | ic->i_req_sequence;
12681296
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
12691297
conn->c_proposed_version, UINT_MAX, UINT_MAX,
1270-
frag, isv6);
1298+
frag, isv6, seq);
12711299
ret = rdma_connect(cm_id, &conn_param);
12721300
if (ret) {
12731301
pr_warn("RDS/IB: rdma_connect failed (%d)\n", ret);

0 commit comments

Comments
 (0)