Skip to content

Commit 1c003b1

Browse files
committed
ipvs: wakeup master thread
High rate of sync messages in master can lead to overflowing the socket buffer and dropping the messages. Fixed sleep of 1 second without wakeup events is not suitable for loaded masters. Use delayed_work to schedule sending for queued messages and limit the delay to IPVS_SYNC_SEND_DELAY (20ms). This will reduce the rate of wakeups but to avoid sending long bursts we wakeup the master thread after IPVS_SYNC_WAKEUP_RATE (8) messages. Add hard limit for the queued messages before sending by using "sync_qlen_max" sysctl var. It defaults to 1/32 of the memory pages but actually represents number of messages. It will protect us from allocating large parts of memory when the sending rate is lower than the queuing rate. As suggested by Pablo, add new sysctl var "sync_sock_size" to configure the SNDBUF (master) or RCVBUF (slave) socket limit. Default value is 0 (preserve system defaults). Change the master thread to detect and block on SNDBUF overflow, so that we do not drop messages when the socket limit is low but the sync_qlen_max limit is not reached. On ENOBUFS or other errors just drop the messages. Change master thread to enter TASK_INTERRUPTIBLE state early, so that we do not miss wakeups due to messages or kthread_should_stop event. Thanks to Pablo Neira Ayuso for his valuable feedback! Signed-off-by: Julian Anastasov <[email protected]> Signed-off-by: Simon Horman <[email protected]>
1 parent cdcc5e9 commit 1c003b1

File tree

3 files changed

+162
-32
lines changed

3 files changed

+162
-32
lines changed

include/net/ip_vs.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,8 @@ struct netns_ipvs {
869869
#endif
870870
int sysctl_snat_reroute;
871871
int sysctl_sync_ver;
872+
int sysctl_sync_qlen_max;
873+
int sysctl_sync_sock_size;
872874
int sysctl_cache_bypass;
873875
int sysctl_expire_nodest_conn;
874876
int sysctl_expire_quiescent_template;
@@ -889,6 +891,9 @@ struct netns_ipvs {
889891
struct timer_list est_timer; /* Estimation timer */
890892
/* ip_vs_sync */
891893
struct list_head sync_queue;
894+
int sync_queue_len;
895+
unsigned int sync_queue_delay;
896+
struct delayed_work master_wakeup_work;
892897
spinlock_t sync_lock;
893898
struct ip_vs_sync_buff *sync_buff;
894899
spinlock_t sync_buff_lock;
@@ -911,6 +916,10 @@ struct netns_ipvs {
911916
#define DEFAULT_SYNC_THRESHOLD 3
912917
#define DEFAULT_SYNC_PERIOD 50
913918
#define DEFAULT_SYNC_VER 1
919+
#define IPVS_SYNC_WAKEUP_RATE 8
920+
#define IPVS_SYNC_QLEN_MAX (IPVS_SYNC_WAKEUP_RATE * 4)
921+
#define IPVS_SYNC_SEND_DELAY (HZ / 50)
922+
#define IPVS_SYNC_CHECK_PERIOD HZ
914923

915924
#ifdef CONFIG_SYSCTL
916925

@@ -929,6 +938,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
929938
return ipvs->sysctl_sync_ver;
930939
}
931940

941+
static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
942+
{
943+
return ipvs->sysctl_sync_qlen_max;
944+
}
945+
946+
static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs)
947+
{
948+
return ipvs->sysctl_sync_sock_size;
949+
}
950+
932951
#else
933952

934953
static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -946,6 +965,16 @@ static inline int sysctl_sync_ver(struct netns_ipvs *ipvs)
946965
return DEFAULT_SYNC_VER;
947966
}
948967

968+
static inline int sysctl_sync_qlen_max(struct netns_ipvs *ipvs)
969+
{
970+
return IPVS_SYNC_QLEN_MAX;
971+
}
972+
973+
static inline int sysctl_sync_sock_size(struct netns_ipvs *ipvs)
974+
{
975+
return 0;
976+
}
977+
949978
#endif
950979

951980
/*

net/netfilter/ipvs/ip_vs_ctl.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,18 @@ static struct ctl_table vs_vars[] = {
17171717
.mode = 0644,
17181718
.proc_handler = &proc_do_sync_mode,
17191719
},
1720+
{
1721+
.procname = "sync_qlen_max",
1722+
.maxlen = sizeof(int),
1723+
.mode = 0644,
1724+
.proc_handler = proc_dointvec,
1725+
},
1726+
{
1727+
.procname = "sync_sock_size",
1728+
.maxlen = sizeof(int),
1729+
.mode = 0644,
1730+
.proc_handler = proc_dointvec,
1731+
},
17201732
{
17211733
.procname = "cache_bypass",
17221734
.maxlen = sizeof(int),
@@ -3655,6 +3667,10 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net)
36553667
tbl[idx++].data = &ipvs->sysctl_snat_reroute;
36563668
ipvs->sysctl_sync_ver = 1;
36573669
tbl[idx++].data = &ipvs->sysctl_sync_ver;
3670+
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3671+
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3672+
ipvs->sysctl_sync_sock_size = 0;
3673+
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
36583674
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
36593675
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
36603676
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;

net/netfilter/ipvs/ip_vs_sync.c

Lines changed: 117 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -307,11 +307,15 @@ static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
307307
spin_lock_bh(&ipvs->sync_lock);
308308
if (list_empty(&ipvs->sync_queue)) {
309309
sb = NULL;
310+
__set_current_state(TASK_INTERRUPTIBLE);
310311
} else {
311312
sb = list_entry(ipvs->sync_queue.next,
312313
struct ip_vs_sync_buff,
313314
list);
314315
list_del(&sb->list);
316+
ipvs->sync_queue_len--;
317+
if (!ipvs->sync_queue_len)
318+
ipvs->sync_queue_delay = 0;
315319
}
316320
spin_unlock_bh(&ipvs->sync_lock);
317321

@@ -358,9 +362,16 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs)
358362
struct ip_vs_sync_buff *sb = ipvs->sync_buff;
359363

360364
spin_lock(&ipvs->sync_lock);
361-
if (ipvs->sync_state & IP_VS_STATE_MASTER)
365+
if (ipvs->sync_state & IP_VS_STATE_MASTER &&
366+
ipvs->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
367+
if (!ipvs->sync_queue_len)
368+
schedule_delayed_work(&ipvs->master_wakeup_work,
369+
max(IPVS_SYNC_SEND_DELAY, 1));
370+
ipvs->sync_queue_len++;
362371
list_add_tail(&sb->list, &ipvs->sync_queue);
363-
else
372+
if ((++ipvs->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
373+
wake_up_process(ipvs->master_thread);
374+
} else
364375
ip_vs_sync_buff_release(sb);
365376
spin_unlock(&ipvs->sync_lock);
366377
}
@@ -379,6 +390,7 @@ get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
379390
time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
380391
sb = ipvs->sync_buff;
381392
ipvs->sync_buff = NULL;
393+
__set_current_state(TASK_RUNNING);
382394
} else
383395
sb = NULL;
384396
spin_unlock_bh(&ipvs->sync_buff_lock);
@@ -392,26 +404,23 @@ get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
392404
void ip_vs_sync_switch_mode(struct net *net, int mode)
393405
{
394406
struct netns_ipvs *ipvs = net_ipvs(net);
407+
struct ip_vs_sync_buff *sb;
395408

409+
spin_lock_bh(&ipvs->sync_buff_lock);
396410
if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
397-
return;
398-
if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
399-
return;
411+
goto unlock;
412+
sb = ipvs->sync_buff;
413+
if (mode == sysctl_sync_ver(ipvs) || !sb)
414+
goto unlock;
400415

401-
spin_lock_bh(&ipvs->sync_buff_lock);
402416
/* Buffer empty ? then let buf_create do the job */
403-
if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
404-
kfree(ipvs->sync_buff);
417+
if (sb->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
418+
ip_vs_sync_buff_release(sb);
405419
ipvs->sync_buff = NULL;
406-
} else {
407-
spin_lock_bh(&ipvs->sync_lock);
408-
if (ipvs->sync_state & IP_VS_STATE_MASTER)
409-
list_add_tail(&ipvs->sync_buff->list,
410-
&ipvs->sync_queue);
411-
else
412-
ip_vs_sync_buff_release(ipvs->sync_buff);
413-
spin_unlock_bh(&ipvs->sync_lock);
414-
}
420+
} else
421+
sb_queue_tail(ipvs);
422+
423+
unlock:
415424
spin_unlock_bh(&ipvs->sync_buff_lock);
416425
}
417426

@@ -1129,6 +1138,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
11291138
}
11301139

11311140

1141+
/*
1142+
* Setup sndbuf (mode=1) or rcvbuf (mode=0)
1143+
*/
1144+
static void set_sock_size(struct sock *sk, int mode, int val)
1145+
{
1146+
/* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
1147+
/* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
1148+
lock_sock(sk);
1149+
if (mode) {
1150+
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1151+
sysctl_wmem_max);
1152+
sk->sk_sndbuf = val * 2;
1153+
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1154+
} else {
1155+
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1156+
sysctl_rmem_max);
1157+
sk->sk_rcvbuf = val * 2;
1158+
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1159+
}
1160+
release_sock(sk);
1161+
}
1162+
11321163
/*
11331164
* Setup loopback of outgoing multicasts on a sending socket
11341165
*/
@@ -1305,6 +1336,9 @@ static struct socket *make_send_sock(struct net *net)
13051336

13061337
set_mcast_loop(sock->sk, 0);
13071338
set_mcast_ttl(sock->sk, 1);
1339+
result = sysctl_sync_sock_size(ipvs);
1340+
if (result > 0)
1341+
set_sock_size(sock->sk, 1, result);
13081342

13091343
result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
13101344
if (result < 0) {
@@ -1350,6 +1384,9 @@ static struct socket *make_receive_sock(struct net *net)
13501384
sk_change_net(sock->sk, net);
13511385
/* it is equivalent to the REUSEADDR option in user-space */
13521386
sock->sk->sk_reuse = SK_CAN_REUSE;
1387+
result = sysctl_sync_sock_size(ipvs);
1388+
if (result > 0)
1389+
set_sock_size(sock->sk, 0, result);
13531390

13541391
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
13551392
sizeof(struct sockaddr));
@@ -1392,18 +1429,22 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
13921429
return len;
13931430
}
13941431

1395-
static void
1432+
static int
13961433
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
13971434
{
13981435
int msize;
1436+
int ret;
13991437

14001438
msize = msg->size;
14011439

14021440
/* Put size in network byte order */
14031441
msg->size = htons(msg->size);
14041442

1405-
if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
1406-
pr_err("ip_vs_send_async error\n");
1443+
ret = ip_vs_send_async(sock, (char *)msg, msize);
1444+
if (ret >= 0 || ret == -EAGAIN)
1445+
return ret;
1446+
pr_err("ip_vs_send_async error %d\n", ret);
1447+
return 0;
14071448
}
14081449

14091450
static int
@@ -1428,36 +1469,75 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
14281469
return len;
14291470
}
14301471

1472+
/* Wakeup the master thread for sending */
1473+
static void master_wakeup_work_handler(struct work_struct *work)
1474+
{
1475+
struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs,
1476+
master_wakeup_work.work);
1477+
1478+
spin_lock_bh(&ipvs->sync_lock);
1479+
if (ipvs->sync_queue_len &&
1480+
ipvs->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1481+
ipvs->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1482+
wake_up_process(ipvs->master_thread);
1483+
}
1484+
spin_unlock_bh(&ipvs->sync_lock);
1485+
}
1486+
1487+
/* Get next buffer to send */
1488+
static inline struct ip_vs_sync_buff *
1489+
next_sync_buff(struct netns_ipvs *ipvs)
1490+
{
1491+
struct ip_vs_sync_buff *sb;
1492+
1493+
sb = sb_dequeue(ipvs);
1494+
if (sb)
1495+
return sb;
1496+
/* Do not delay entries in buffer for more than 2 seconds */
1497+
return get_curr_sync_buff(ipvs, 2 * HZ);
1498+
}
14311499

14321500
static int sync_thread_master(void *data)
14331501
{
14341502
struct ip_vs_sync_thread_data *tinfo = data;
14351503
struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
1504+
struct sock *sk = tinfo->sock->sk;
14361505
struct ip_vs_sync_buff *sb;
14371506

14381507
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
14391508
"syncid = %d\n",
14401509
ipvs->master_mcast_ifn, ipvs->master_syncid);
14411510

1442-
while (!kthread_should_stop()) {
1443-
while ((sb = sb_dequeue(ipvs))) {
1444-
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
1445-
ip_vs_sync_buff_release(sb);
1511+
for (;;) {
1512+
sb = next_sync_buff(ipvs);
1513+
if (unlikely(kthread_should_stop()))
1514+
break;
1515+
if (!sb) {
1516+
schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
1517+
continue;
14461518
}
1447-
1448-
/* check if entries stay in ipvs->sync_buff for 2 seconds */
1449-
sb = get_curr_sync_buff(ipvs, 2 * HZ);
1450-
if (sb) {
1451-
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
1452-
ip_vs_sync_buff_release(sb);
1519+
while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
1520+
int ret = 0;
1521+
1522+
__wait_event_interruptible(*sk_sleep(sk),
1523+
sock_writeable(sk) ||
1524+
kthread_should_stop(),
1525+
ret);
1526+
if (unlikely(kthread_should_stop()))
1527+
goto done;
14531528
}
1454-
1455-
schedule_timeout_interruptible(HZ);
1529+
ip_vs_sync_buff_release(sb);
14561530
}
14571531

1532+
done:
1533+
__set_current_state(TASK_RUNNING);
1534+
if (sb)
1535+
ip_vs_sync_buff_release(sb);
1536+
14581537
/* clean up the sync_buff queue */
14591538
while ((sb = sb_dequeue(ipvs)))
14601539
ip_vs_sync_buff_release(sb);
1540+
__set_current_state(TASK_RUNNING);
14611541

14621542
/* clean up the current sync_buff */
14631543
sb = get_curr_sync_buff(ipvs, 0);
@@ -1538,6 +1618,10 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
15381618
realtask = &ipvs->master_thread;
15391619
name = "ipvs_master:%d";
15401620
threadfn = sync_thread_master;
1621+
ipvs->sync_queue_len = 0;
1622+
ipvs->sync_queue_delay = 0;
1623+
INIT_DELAYED_WORK(&ipvs->master_wakeup_work,
1624+
master_wakeup_work_handler);
15411625
sock = make_send_sock(net);
15421626
} else if (state == IP_VS_STATE_BACKUP) {
15431627
if (ipvs->backup_thread)
@@ -1623,6 +1707,7 @@ int stop_sync_thread(struct net *net, int state)
16231707
spin_lock_bh(&ipvs->sync_lock);
16241708
ipvs->sync_state &= ~IP_VS_STATE_MASTER;
16251709
spin_unlock_bh(&ipvs->sync_lock);
1710+
cancel_delayed_work_sync(&ipvs->master_wakeup_work);
16261711
retc = kthread_stop(ipvs->master_thread);
16271712
ipvs->master_thread = NULL;
16281713
} else if (state == IP_VS_STATE_BACKUP) {

0 commit comments

Comments
 (0)