Skip to content

Commit 8991a87

Browse files
Bang Nguyen, Jerry Snitselaar
authored and committed
RDS: SA query optimization
SA query optimization: All QoS lanes share the same physical path between an IP pair. The only difference is the service level, which affects the quality of service for each lane. With that, we have the following optimizations: 1. Lane 0 issues the SA query request to the SM. All other lanes wait for lane 0 to finish route resolution, then copy in the resolved path and fill in their own service level. 2. One-sided reconnect to reduce reconnect racing, thus further reducing the number of SA queries to the SM. Reducing brownout for non-zero lanes: In some cases, RDMA CM delays the disconnect event after a switch/node failure, and this causes extra brownout for RDS reconnection. The workaround is to have lane 0 probe the other lanes by sending a heartbeat (HB) message. If a lane is down, this causes a send completion error and an immediate reconnect. Orabug: 18801977 Signed-off-by: Bang Nguyen <[email protected]> Signed-off-by: Mukesh Kacker <[email protected]> (cherry picked from commit 8f84b1ff46e449e99c5fcf4d4f94dc2e8ea82cd7) Signed-off-by: Jerry Snitselaar <[email protected]>
1 parent 5bddb28 commit 8991a87

File tree

8 files changed

+106
-17
lines changed

8 files changed

+106
-17
lines changed

net/rds/connection.c

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
209209
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
210210
INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
211211
INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker);
212+
INIT_DELAYED_WORK(&conn->c_reconn_w, rds_reconnect_timeout);
212213
INIT_DELAYED_WORK(&conn->c_reject_w, rds_reject_worker);
213214
INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
214215
mutex_init(&conn->c_cm_lock);
@@ -577,15 +578,7 @@ void rds_conn_exit(void)
577578
rds_conn_message_info_retrans);
578579
}
579580

580-
/*
581-
* Drop connections when the idled QoS connection not getting
582-
* disconnect event when the remote peer reboots. This is causing
583-
* delayed reconnect, hence application brownout when the peer comes online.
584-
* The fix was to proactively drop and reconnect them when the base lane is
585-
* going through the reconnect to the reboot peer, in effect forcing all
586-
* the lanes to go through the reconnect at the same time.
587-
*/
588-
static void rds_conn_shutdown_lanes(struct rds_connection *conn)
581+
static void rds_conn_probe_lanes(struct rds_connection *conn)
589582
{
590583
struct hlist_head *head =
591584
rds_conn_bucket(conn->c_laddr, conn->c_faddr);
@@ -598,7 +591,8 @@ static void rds_conn_shutdown_lanes(struct rds_connection *conn)
598591
tmp->c_laddr == conn->c_laddr &&
599592
tmp->c_tos != 0 &&
600593
tmp->c_trans == conn->c_trans) {
601-
rds_conn_drop(tmp);
594+
if (rds_conn_up(tmp))
595+
rds_send_hb(tmp, 0);
602596
}
603597
}
604598
rcu_read_unlock();
@@ -616,11 +610,16 @@ void rds_conn_drop(struct rds_connection *conn)
616610
conn->c_reconnect_warn = 1;
617611
conn->c_reconnect_drops = 0;
618612
conn->c_reconnect_err = 0;
613+
conn->c_reconnect_racing = 0;
619614
printk(KERN_INFO "RDS/IB: connection "
620615
"<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n",
621616
NIPQUAD(conn->c_laddr),
622617
NIPQUAD(conn->c_faddr),
623618
conn->c_tos);
619+
620+
if (conn->c_tos == 0)
621+
rds_conn_probe_lanes(conn);
622+
624623
} else if ((conn->c_reconnect_warn) &&
625624
(now - conn->c_reconnect_start > 60)) {
626625
printk(KERN_INFO "RDS/IB: re-connect "
@@ -633,9 +632,8 @@ void rds_conn_drop(struct rds_connection *conn)
633632
conn->c_reconnect_err);
634633
conn->c_reconnect_warn = 0;
635634

636-
/* see comment for rds_conn_shutdown_lanes() */
637635
if (conn->c_tos == 0)
638-
rds_conn_shutdown_lanes(conn);
636+
rds_conn_probe_lanes(conn);
639637
}
640638
conn->c_reconnect_drops++;
641639

net/rds/ib.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
410410

411411
/* Send multiple ARPs to improve reliability */
412412
for (i = 0; i < rds_ib_active_bonding_arps; i++) {
413-
arp_send(ARPOP_REQUEST, ETH_P_ARP,
413+
arp_send(ARPOP_REPLY, ETH_P_ARP,
414414
ip_addr, out_dev,
415415
ip_addr, NULL,
416416
dev_addr, NULL);
@@ -1719,8 +1719,14 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
17191719

17201720
break;
17211721
case NETDEV_DOWN:
1722-
INIT_DELAYED_WORK(&work->work, rds_ib_failover);
1723-
queue_delayed_work(rds_wq, &work->work, 0);
1722+
if (rds_ib_sysctl_active_bonding) {
1723+
INIT_DELAYED_WORK(&work->work, rds_ib_failover);
1724+
queue_delayed_work(rds_wq, &work->work, 0);
1725+
} else {
1726+
ip_config[port].port_state = RDS_IB_PORT_INIT;
1727+
ip_config[port].ip_active_port = port;
1728+
kfree(work);
1729+
}
17241730
break;
17251731
}
17261732

net/rds/ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,7 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
594594
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
595595
extern unsigned long rds_ib_sysctl_max_recv_allocation;
596596
extern unsigned int rds_ib_sysctl_flow_control;
597+
extern unsigned int rds_ib_sysctl_active_bonding;
597598
extern ctl_table rds_ib_sysctl_table[];
598599

599600
#endif

net/rds/ib_cm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
813813
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
814814
unsigned long now = get_seconds();
815815

816+
conn->c_reconnect_racing++;
817+
816818
/*
817819
* after 15 seconds, give up on existing connection
818820
* attempts and make them try again. At this point

net/rds/ib_sysctl.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
6161
*/
6262

6363
unsigned int rds_ib_sysctl_flow_control = 0;
64+
unsigned int rds_ib_sysctl_active_bonding = 1;
6465

6566
ctl_table rds_ib_sysctl_table[] = {
6667
{
@@ -104,6 +105,13 @@ ctl_table rds_ib_sysctl_table[] = {
104105
.mode = 0644,
105106
.proc_handler = &proc_dointvec,
106107
},
108+
{
109+
.procname = "active_bonding",
110+
.data = &rds_ib_sysctl_active_bonding,
111+
.maxlen = sizeof(rds_ib_sysctl_active_bonding),
112+
.mode = 0644,
113+
.proc_handler = &proc_dointvec,
114+
},
107115
{ }
108116
};
109117

net/rds/rdma_transport.c

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
*
3232
*/
3333
#include <rdma/rdma_cm.h>
34+
#include <rdma/rdma_cm_ib.h>
3435

3536
#include "rdma_transport.h"
3637
#include "ib.h"
@@ -94,6 +95,40 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
9495
if (rds_ib_apm_enabled)
9596
rdma_set_timeout(cm_id, rds_ib_apm_timeout);
9697

98+
if (conn->c_tos && conn->c_reconnect) {
99+
struct rds_ib_connection *base_ic =
100+
conn->c_base_conn->c_transport_data;
101+
102+
mutex_lock(&conn->c_base_conn->c_cm_lock);
103+
if (rds_conn_transition(conn->c_base_conn, RDS_CONN_UP,
104+
RDS_CONN_UP)) {
105+
ret = rdma_set_ib_paths(cm_id,
106+
base_ic->i_cm_id->route.path_rec,
107+
base_ic->i_cm_id->route.num_paths);
108+
if (!ret) {
109+
struct rds_ib_connection *ic =
110+
conn->c_transport_data;
111+
112+
cm_id->route.path_rec[0].sl =
113+
ic->i_sl;
114+
cm_id->route.path_rec[0].qos_class =
115+
conn->c_tos;
116+
ret = trans->cm_initiate_connect(cm_id);
117+
}
118+
} else {
119+
ret = 1;
120+
}
121+
mutex_unlock(&conn->c_base_conn->c_cm_lock);
122+
123+
if (ret) {
124+
rds_conn_drop(conn);
125+
ret = 0;
126+
}
127+
128+
break;
129+
}
130+
131+
97132
/* XXX do we need to clean up if this fails? */
98133
ret = rdma_resolve_route(cm_id,
99134
rds_rdma_resolve_to_ms[conn->c_to_index]);
@@ -172,6 +207,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
172207
break;
173208

174209
case RDMA_CM_EVENT_ADDR_ERROR:
210+
if (conn)
211+
rds_conn_drop(conn);
212+
break;
213+
175214
case RDMA_CM_EVENT_CONNECT_ERROR:
176215
case RDMA_CM_EVENT_UNREACHABLE:
177216
case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -181,8 +220,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
181220

182221
case RDMA_CM_EVENT_REJECTED:
183222
err = (int *)event->param.conn.private_data;
223+
224+
if (conn && event->status == RDS_REJ_CONSUMER_DEFINED &&
225+
*err <= 1)
226+
conn->c_reconnect_racing++;
227+
184228
if (conn) {
185-
if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) {
229+
if (event->status == RDS_REJ_CONSUMER_DEFINED &&
230+
(*err) == 0) {
186231
/* Rejection from RDSV3.1 */
187232
if (!conn->c_tos) {
188233
conn->c_proposed_version =

net/rds/rds.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ enum {
8787
#define RDS_RECV_REFILL 3
8888

8989
#define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5
90+
#define RDS_ADDR_RES_TM_INDEX_MAX 5
9091

9192
struct rds_connection {
9293
struct hlist_node c_hash_node;
@@ -125,6 +126,7 @@ struct rds_connection {
125126
struct delayed_work c_conn_w;
126127
struct delayed_work c_reject_w;
127128
struct delayed_work c_hb_w;
129+
struct delayed_work c_reconn_w;
128130
struct work_struct c_down_w;
129131
struct mutex c_cm_lock; /* protect conn state & cm */
130132
wait_queue_head_t c_waitq;
@@ -161,6 +163,8 @@ struct rds_connection {
161163
unsigned int c_route_to_base;
162164

163165
unsigned int c_rdsinfo_pending;
166+
167+
unsigned int c_reconnect_racing;
164168
};
165169

166170
#define RDS_FLAG_CONG_BITMAP 0x01
@@ -867,6 +871,7 @@ void rds_send_worker(struct work_struct *);
867871
void rds_reject_worker(struct work_struct *);
868872
void rds_recv_worker(struct work_struct *);
869873
void rds_hb_worker(struct work_struct *);
874+
void rds_reconnect_timeout(struct work_struct *);
870875
void rds_connect_complete(struct rds_connection *conn);
871876

872877
/* transport.c */

net/rds/threads.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,35 @@ void rds_hb_worker(struct work_struct *work)
264264
}
265265
}
266266

267+
void rds_reconnect_timeout(struct work_struct *work)
268+
{
269+
struct rds_connection *conn =
270+
container_of(work, struct rds_connection, c_reconn_w.work);
271+
272+
/* if the higher IP has not reconnected, reset back to two-sided
273+
* reconnect.
274+
*/
275+
if (!rds_conn_up(conn)) {
276+
rds_conn_drop(conn);
277+
conn->c_reconnect_racing = 0;
278+
}
279+
}
280+
267281
void rds_shutdown_worker(struct work_struct *work)
268282
{
269283
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
270284

271-
rds_conn_shutdown(conn, 1);
285+
286+
/* if racing is detected, the lower IP backs off and lets the higher IP
287+
* drive the reconnect (one-sided reconnect)
288+
*/
289+
if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) {
290+
rds_conn_shutdown(conn, 0);
291+
queue_delayed_work(rds_wq, &conn->c_reconn_w,
292+
msecs_to_jiffies(5000));
293+
} else
294+
rds_conn_shutdown(conn, 1);
295+
272296
}
273297

274298
void rds_threads_exit(void)

0 commit comments

Comments
 (0)