Skip to content

Commit 57b8368

Browse files
Hakon-Bugge authored and
Somasundaram Krishnasamy committed
rds: Introduce heartbeat interval
RDS' heartbeat mechanism polls every second for a heartbeat pong. When received, it immediately sends another heartbeat ping. In systems with many connections, this may be way too aggressive. Hence, introduce the rds_conn_hb_interval module parameter and a simple state to the heartbeat mechanism, in order to delay sending the ping after the heartbeat pong has been received. Also, pseudo-randomize the delay from 50% to 150% of the interval, to avoid many heartbeat pings being sent in close proximity in time. Orabug: 30418039 Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Ka-Cheong Poon <[email protected]> Tested-by: Michael Nowak <[email protected]> --- v1->v2: * Added Ka-Cheong's r-b * Added Mike's t-b Signed-off-by: Somasundaram Krishnasamy <[email protected]>
1 parent 9bc87c3 commit 57b8368

File tree

2 files changed

+36
-11
lines changed

2 files changed

+36
-11
lines changed

net/rds/rds.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,11 @@ enum rds_conn_drop_src {
251251
DR_TCP_SEND_FAIL,
252252
};
253253

254+
enum rds_hb_state {
255+
HB_PING_SENT,
256+
HB_PONG_RCVD,
257+
};
258+
254259
#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
255260

256261
/* Per mpath connection state */
@@ -311,6 +316,7 @@ struct rds_conn_path {
311316
unsigned int cp_pending_flush;
312317

313318
unsigned long cp_hb_start;
319+
enum rds_hb_state cp_hb_state;
314320

315321
unsigned int cp_rdsinfo_pending;
316322

net/rds/threads.c

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@
3838

3939
static unsigned int rds_conn_hb_timeout = 0;
4040
module_param(rds_conn_hb_timeout, int, 0444);
41-
MODULE_PARM_DESC(rds_conn_hb_timeout, " Connection heartbeat timeout");
41+
MODULE_PARM_DESC(rds_conn_hb_timeout, " Connection heartbeat timeout (seconds)");
42+
static unsigned int rds_conn_hb_interval = 10;
43+
module_param(rds_conn_hb_interval, int, 0444);
44+
MODULE_PARM_DESC(rds_conn_hb_interval, " Connection heartbeat interval (seconds)");
4245

4346

4447
/*
@@ -129,9 +132,10 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
129132
rds_cond_queue_send_work(cp, 0);
130133
rds_clear_queued_recv_work_bit(cp);
131134
rds_cond_queue_recv_work(cp, 0);
135+
cp->cp_hb_start = 0;
136+
cp->cp_hb_state = HB_PONG_RCVD;
132137
queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, 0);
133138
cancel_delayed_work(&cp->cp_reconn_w);
134-
cp->cp_hb_start = 0;
135139

136140
rds_update_avg_connect_time(cp);
137141
cp->cp_connection_start = get_seconds();
@@ -336,6 +340,7 @@ void rds_hb_worker(struct work_struct *work)
336340
struct rds_conn_path,
337341
cp_hb_w.work);
338342
unsigned long now = get_seconds();
343+
unsigned long delay = HZ;
339344
int ret;
340345
struct rds_connection *conn = cp->cp_conn;
341346

@@ -344,24 +349,38 @@ void rds_hb_worker(struct work_struct *work)
344349
return;
345350

346351
if (rds_conn_path_state(cp) == RDS_CONN_UP) {
347-
if (!cp->cp_hb_start) {
352+
switch (cp->cp_hb_state) {
353+
case HB_PING_SENT:
354+
if (!cp->cp_hb_start) {
355+
cp->cp_hb_state = HB_PONG_RCVD;
356+
/* Pseudo random from 50% to 150% of interval */
357+
delay = msecs_to_jiffies(rds_conn_hb_interval * 1000 / 2) +
358+
msecs_to_jiffies(prandom_u32() % rds_conn_hb_interval * 1000);
359+
} else if (now - cp->cp_hb_start > rds_conn_hb_timeout) {
360+
rds_rtd_ptr(RDS_RTD_CM,
361+
"RDS/IB: connection <%pI6c,%pI6c,%d> timed out (0x%lx,0x%lx)..discon and recon\n",
362+
&conn->c_laddr, &conn->c_faddr,
363+
conn->c_tos, cp->cp_hb_start, now);
364+
rds_conn_path_drop(cp, DR_HB_TIMEOUT);
365+
return;
366+
}
367+
break;
368+
369+
case HB_PONG_RCVD:
348370
ret = rds_send_hb(cp->cp_conn, 0);
371+
349372
if (ret) {
350373
rds_rtd(RDS_RTD_ERR_EXT,
351374
"RDS/IB: rds_hb_worker: failed %d\n",
352375
ret);
353376
return;
354377
}
355378
cp->cp_hb_start = now;
356-
} else if (now - cp->cp_hb_start > rds_conn_hb_timeout) {
357-
rds_rtd_ptr(RDS_RTD_CM,
358-
"RDS/IB: connection <%pI6c,%pI6c,%d> timed out (0x%lx,0x%lx)..discon and recon\n",
359-
&conn->c_laddr, &conn->c_faddr,
360-
conn->c_tos, cp->cp_hb_start, now);
361-
rds_conn_path_drop(cp, DR_HB_TIMEOUT);
362-
return;
379+
cp->cp_hb_state = HB_PING_SENT;
380+
break;
363381
}
364-
queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, HZ);
382+
383+
queue_delayed_work(cp->cp_wq, &cp->cp_hb_w, delay);
365384
}
366385
}
367386

0 commit comments

Comments
 (0)