Skip to content

Commit b128261

Browse files
Bang Nguyen authored and
Mukesh Kacker committed
RDS: SA query optimization
SA query optimization

All QoS lanes share the same physical path between an IP pair. The only difference is the service level, which affects the quality of service for each lane. Given that, we have the following optimizations:

1. Lane 0 issues the SA query request to the SM. All other lanes wait for lane 0 to finish route resolution, then copy in the resolved path and fill in their own service level.
2. One-sided reconnect to reduce reconnect racing, thus further reducing the number of SA queries to the SM.

Reducing brownout for non-zero lanes

In some cases, RDMA CM delays the disconnect event after a switch/node failure, and this causes extra brownout for RDS reconnection. The workaround is to have lane 0 probe the other lanes by sending a heartbeat (HB) message. If a lane is down, this causes a send completion error and an immediate reconnect.

Orabug: 18801977
Signed-off-by: Bang Nguyen <[email protected]>
Signed-off-by: Mukesh Kacker <[email protected]>
(cherry picked from commit 8f84b1ff46e449e99c5fcf4d4f94dc2e8ea82cd7)
Signed-off-by: Jerry Snitselaar <[email protected]>
(cherry picked from commit 8991a87)
1 parent d3e9f58 commit b128261

File tree

8 files changed

+107
-18
lines changed

8 files changed

+107
-18
lines changed

net/rds/connection.c

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
214214
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
215215
INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
216216
INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker);
217+
INIT_DELAYED_WORK(&conn->c_reconn_w, rds_reconnect_timeout);
217218
INIT_DELAYED_WORK(&conn->c_reject_w, rds_reject_worker);
218219
INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
219220
mutex_init(&conn->c_cm_lock);
@@ -580,15 +581,7 @@ void rds_conn_exit(void)
580581
rds_conn_message_info_retrans);
581582
}
582583

583-
/*
584-
* Drop connections when the idled QoS connection not getting
585-
* disconnect event when the remote peer reboots. This is causing
586-
* delayed reconnect, hence application brownout when the peer comes online.
587-
* The fix was to proactively drop and reconnect them when the base lane is
588-
* going through the reconnect to the reboot peer, in effect forcing all
589-
* the lanes to go through the reconnect at the same time.
590-
*/
591-
static void rds_conn_shutdown_lanes(struct rds_connection *conn)
584+
static void rds_conn_probe_lanes(struct rds_connection *conn)
592585
{
593586
struct hlist_head *head =
594587
rds_conn_bucket(conn->c_laddr, conn->c_faddr);
@@ -600,7 +593,8 @@ static void rds_conn_shutdown_lanes(struct rds_connection *conn)
600593
tmp->c_laddr == conn->c_laddr &&
601594
tmp->c_tos != 0 &&
602595
tmp->c_trans == conn->c_trans) {
603-
rds_conn_drop(tmp);
596+
if (rds_conn_up(tmp))
597+
rds_send_hb(tmp, 0);
604598
}
605599
}
606600
rcu_read_unlock();
@@ -618,11 +612,16 @@ void rds_conn_drop(struct rds_connection *conn)
618612
conn->c_reconnect_warn = 1;
619613
conn->c_reconnect_drops = 0;
620614
conn->c_reconnect_err = 0;
615+
conn->c_reconnect_racing = 0;
621616
printk(KERN_INFO "RDS/IB: connection "
622617
"<%u.%u.%u.%u,%u.%u.%u.%u,%d> dropped\n",
623618
NIPQUAD(conn->c_laddr),
624619
NIPQUAD(conn->c_faddr),
625620
conn->c_tos);
621+
622+
if (conn->c_tos == 0)
623+
rds_conn_probe_lanes(conn);
624+
626625
} else if ((conn->c_reconnect_warn) &&
627626
(now - conn->c_reconnect_start > 60)) {
628627
printk(KERN_INFO "RDS/IB: re-connect "
@@ -635,9 +634,8 @@ void rds_conn_drop(struct rds_connection *conn)
635634
conn->c_reconnect_err);
636635
conn->c_reconnect_warn = 0;
637636

638-
/* see comment for rds_conn_shutdown_lanes() */
639637
if (conn->c_tos == 0)
640-
rds_conn_shutdown_lanes(conn);
638+
rds_conn_probe_lanes(conn);
641639
}
642640
conn->c_reconnect_drops++;
643641

net/rds/ib.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
427427

428428
/* Send multiple ARPs to improve reliability */
429429
for (i = 0; i < rds_ib_active_bonding_arps; i++) {
430-
arp_send(ARPOP_REQUEST, ETH_P_ARP,
430+
arp_send(ARPOP_REPLY, ETH_P_ARP,
431431
ip_addr, out_dev,
432432
ip_addr, NULL,
433433
dev_addr, NULL);
@@ -1736,8 +1736,14 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
17361736

17371737
break;
17381738
case NETDEV_DOWN:
1739-
INIT_DELAYED_WORK(&work->work, rds_ib_failover);
1740-
queue_delayed_work(rds_wq, &work->work, 0);
1739+
if (rds_ib_sysctl_active_bonding) {
1740+
INIT_DELAYED_WORK(&work->work, rds_ib_failover);
1741+
queue_delayed_work(rds_wq, &work->work, 0);
1742+
} else {
1743+
ip_config[port].port_state = RDS_IB_PORT_INIT;
1744+
ip_config[port].ip_active_port = port;
1745+
kfree(work);
1746+
}
17411747
break;
17421748
}
17431749

net/rds/ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,5 +600,6 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
600600
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
601601
extern unsigned long rds_ib_sysctl_max_recv_allocation;
602602
extern unsigned int rds_ib_sysctl_flow_control;
603+
extern unsigned int rds_ib_sysctl_active_bonding;
603604

604605
#endif

net/rds/ib_cm.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ static void rds_ib_rx(struct rds_ib_connection *ic)
426426
if (ic->i_rx_poll_cq >= RDS_IB_RX_LIMIT) {
427427
ic->i_rx_w.ic = ic;
428428
/* Delay 10 msecs until the RX worker starts reaping again */
429-
queue_delayed_work(rds_aux_wq, &ic->i_rx_w,
429+
queue_delayed_work(rds_aux_wq, &ic->i_rx_w.work,
430430
msecs_to_jiffies(10));
431431
ic->i_rx_wait_for_handler = 1;
432432
}
@@ -823,6 +823,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
823823
} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
824824
unsigned long now = get_seconds();
825825

826+
conn->c_reconnect_racing++;
827+
826828
/*
827829
* after 15 seconds, give up on existing connection
828830
* attempts and make them try again. At this point

net/rds/ib_sysctl.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
6161
*/
6262

6363
unsigned int rds_ib_sysctl_flow_control = 0;
64+
unsigned int rds_ib_sysctl_active_bonding = 1;
6465

6566
static struct ctl_table rds_ib_sysctl_table[] = {
6667
{
@@ -104,6 +105,13 @@ static struct ctl_table rds_ib_sysctl_table[] = {
104105
.mode = 0644,
105106
.proc_handler = proc_dointvec,
106107
},
108+
{
109+
.procname = "active_bonding",
110+
.data = &rds_ib_sysctl_active_bonding,
111+
.maxlen = sizeof(rds_ib_sysctl_active_bonding),
112+
.mode = 0644,
113+
.proc_handler = &proc_dointvec,
114+
},
107115
{ }
108116
};
109117

net/rds/rdma_transport.c

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
*
3232
*/
3333
#include <rdma/rdma_cm.h>
34+
#include <rdma/rdma_cm_ib.h>
3435

3536
#include "rdma_transport.h"
3637
#include "ib.h"
@@ -96,6 +97,40 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
9697
rdma_set_timeout(cm_id, rds_ib_apm_timeout);
9798
#endif
9899

100+
if (conn->c_tos && conn->c_reconnect) {
101+
struct rds_ib_connection *base_ic =
102+
conn->c_base_conn->c_transport_data;
103+
104+
mutex_lock(&conn->c_base_conn->c_cm_lock);
105+
if (rds_conn_transition(conn->c_base_conn, RDS_CONN_UP,
106+
RDS_CONN_UP)) {
107+
ret = rdma_set_ib_paths(cm_id,
108+
base_ic->i_cm_id->route.path_rec,
109+
base_ic->i_cm_id->route.num_paths);
110+
if (!ret) {
111+
struct rds_ib_connection *ic =
112+
conn->c_transport_data;
113+
114+
cm_id->route.path_rec[0].sl =
115+
ic->i_sl;
116+
cm_id->route.path_rec[0].qos_class =
117+
conn->c_tos;
118+
ret = trans->cm_initiate_connect(cm_id);
119+
}
120+
} else {
121+
ret = 1;
122+
}
123+
mutex_unlock(&conn->c_base_conn->c_cm_lock);
124+
125+
if (ret) {
126+
rds_conn_drop(conn);
127+
ret = 0;
128+
}
129+
130+
break;
131+
}
132+
133+
99134
/* XXX do we need to clean up if this fails? */
100135
ret = rdma_resolve_route(cm_id,
101136
rds_rdma_resolve_to_ms[conn->c_to_index]);
@@ -176,6 +211,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
176211
break;
177212

178213
case RDMA_CM_EVENT_ADDR_ERROR:
214+
if (conn)
215+
rds_conn_drop(conn);
216+
break;
217+
179218
case RDMA_CM_EVENT_CONNECT_ERROR:
180219
case RDMA_CM_EVENT_UNREACHABLE:
181220
case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -185,8 +224,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
185224

186225
case RDMA_CM_EVENT_REJECTED:
187226
err = (int *)event->param.conn.private_data;
227+
228+
if (conn && event->status == RDS_REJ_CONSUMER_DEFINED &&
229+
*err <= 1)
230+
conn->c_reconnect_racing++;
231+
188232
if (conn) {
189-
if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) {
233+
if (event->status == RDS_REJ_CONSUMER_DEFINED &&
234+
(*err) == 0) {
190235
/* Rejection from RDSV3.1 */
191236
if (!conn->c_tos) {
192237
conn->c_proposed_version =

net/rds/rds.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ enum {
9292
#define RDS_RECV_REFILL 3
9393

9494
#define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5
95+
#define RDS_ADDR_RES_TM_INDEX_MAX 5
9596

9697
struct rds_connection {
9798
struct hlist_node c_hash_node;
@@ -130,6 +131,7 @@ struct rds_connection {
130131
struct delayed_work c_conn_w;
131132
struct delayed_work c_reject_w;
132133
struct delayed_work c_hb_w;
134+
struct delayed_work c_reconn_w;
133135
struct work_struct c_down_w;
134136
struct mutex c_cm_lock; /* protect conn state & cm */
135137
wait_queue_head_t c_waitq;
@@ -166,6 +168,8 @@ struct rds_connection {
166168
unsigned int c_route_to_base;
167169

168170
unsigned int c_rdsinfo_pending;
171+
172+
unsigned int c_reconnect_racing;
169173
};
170174

171175
#define RDS_FLAG_CONG_BITMAP 0x01
@@ -869,6 +873,7 @@ void rds_send_worker(struct work_struct *);
869873
void rds_reject_worker(struct work_struct *);
870874
void rds_recv_worker(struct work_struct *);
871875
void rds_hb_worker(struct work_struct *);
876+
void rds_reconnect_timeout(struct work_struct *);
872877
void rds_connect_complete(struct rds_connection *conn);
873878

874879
/* transport.c */

net/rds/threads.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,35 @@ void rds_hb_worker(struct work_struct *work)
264264
}
265265
}
266266

267+
void rds_reconnect_timeout(struct work_struct *work)
268+
{
269+
struct rds_connection *conn =
270+
container_of(work, struct rds_connection, c_reconn_w.work);
271+
272+
/* if the higher IP has not reconnected, reset back to two-sided
273+
* reconnect.
274+
*/
275+
if (!rds_conn_up(conn)) {
276+
rds_conn_drop(conn);
277+
conn->c_reconnect_racing = 0;
278+
}
279+
}
280+
267281
void rds_shutdown_worker(struct work_struct *work)
268282
{
269283
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
270284

271-
rds_conn_shutdown(conn, 1);
285+
286+
/* if racing is detected, the lower IP backs off and lets the higher IP
287+
* drive the reconnect (one-sided reconnect)
288+
*/
289+
if (conn->c_laddr < conn->c_faddr && conn->c_reconnect_racing) {
290+
rds_conn_shutdown(conn, 0);
291+
queue_delayed_work(rds_wq, &conn->c_reconn_w,
292+
msecs_to_jiffies(5000));
293+
} else
294+
rds_conn_shutdown(conn, 1);
295+
272296
}
273297

274298
void rds_threads_exit(void)

0 commit comments

Comments
 (0)