Skip to content

Commit f54b311

Browse files
edumazet authored and davem330 committed
tcp: auto corking
With the introduction of TCP Small Queues, TSO auto sizing, and TCP pacing, we can implement Automatic Corking in the kernel, to help applications doing small write()/sendmsg() calls to TCP sockets. The idea is to change tcp_push() to check if the current skb payload is under the skb optimal size (a multiple of MSS bytes). If it is under 'size_goal', and at least one packet is still in Qdisc or NIC TX queues, set the TCP Small Queue Throttled bit, so that the push will be delayed up to TX completion time. This delay might allow the application to coalesce more bytes in the skb in following write()/sendmsg()/sendfile() system calls. The exact duration of the delay depends on the dynamics of the system, and might be zero if no packet for this flow is actually held in Qdisc or NIC TX ring. Using FQ/pacing is a way to increase the probability of autocorking being triggered. Add a new sysctl (/proc/sys/net/ipv4/tcp_autocorking) to control this feature and default it to 1 (enabled). Add a new SNMP counter: nstat -a | grep TcpExtTCPAutoCorking. This counter is incremented every time we detect that an skb was underused and its flush was deferred. Tested: Interesting effects when using line-buffered commands under ssh. Excellent performance results in terms of CPU usage and total throughput.
lpq83:~# echo 1 >/proc/sys/net/ipv4/tcp_autocorking lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128 9410.39 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128': 35209.439626 task-clock # 2.901 CPUs utilized 2,294 context-switches # 0.065 K/sec 101 CPU-migrations # 0.003 K/sec 4,079 page-faults # 0.116 K/sec 97,923,241,298 cycles # 2.781 GHz [83.31%] 51,832,908,236 stalled-cycles-frontend # 52.93% frontend cycles idle [83.30%] 25,697,986,603 stalled-cycles-backend # 26.24% backend cycles idle [66.70%] 102,225,978,536 instructions # 1.04 insns per cycle # 0.51 stalled cycles per insn [83.38%] 18,657,696,819 branches # 529.906 M/sec [83.29%] 91,679,646 branch-misses # 0.49% of all branches [83.40%] 12.136204899 seconds time elapsed lpq83:~# echo 0 >/proc/sys/net/ipv4/tcp_autocorking lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128 6624.89 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128': 40045.864494 task-clock # 3.301 CPUs utilized 171 context-switches # 0.004 K/sec 53 CPU-migrations # 0.001 K/sec 4,080 page-faults # 0.102 K/sec 111,340,458,645 cycles # 2.780 GHz [83.34%] 61,778,039,277 stalled-cycles-frontend # 55.49% frontend cycles idle [83.31%] 29,295,522,759 stalled-cycles-backend # 26.31% backend cycles idle [66.67%] 108,654,349,355 instructions # 0.98 insns per cycle # 0.57 stalled cycles per insn [83.34%] 19,552,170,748 branches # 488.244 M/sec [83.34%] 157,875,417 branch-misses # 0.81% of all branches [83.34%] 12.130267788 seconds time elapsed Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent d8535a0 commit f54b311

File tree

6 files changed

+72
-13
lines changed

6 files changed

+72
-13
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,16 @@ tcp_app_win - INTEGER
156156
buffer. Value 0 is special, it means that nothing is reserved.
157157
Default: 31
158158

159+
tcp_autocorking - BOOLEAN
160+
Enable TCP auto corking :
161+
When applications do consecutive small write()/sendmsg() system calls,
162+
we try to coalesce these small writes as much as possible, to lower
163+
the total number of sent packets. This is done if at least one prior
164+
packet for the flow is waiting in Qdisc queues or device transmit
165+
queue. Applications can still use TCP_CORK for optimal behavior
166+
when they know how/when to uncork their sockets.
167+
Default : 1
168+
159169
tcp_available_congestion_control - STRING
160170
Shows the available congestion control choices that are registered.
161171
More congestion control algorithms may be available as modules,

include/net/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ extern int sysctl_tcp_limit_output_bytes;
282282
extern int sysctl_tcp_challenge_ack_limit;
283283
extern unsigned int sysctl_tcp_notsent_lowat;
284284
extern int sysctl_tcp_min_tso_segs;
285+
extern int sysctl_tcp_autocorking;
285286

286287
extern atomic_long_t tcp_memory_allocated;
287288
extern struct percpu_counter tcp_sockets_allocated;

include/uapi/linux/snmp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ enum
258258
LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */
259259
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
260260
LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */
261+
LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */
261262
__LINUX_MIB_MAX
262263
};
263264

net/ipv4/proc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ static const struct snmp_mib snmp4_net_list[] = {
279279
SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
280280
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
281281
SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
282+
SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
282283
SNMP_MIB_SENTINEL
283284
};
284285

net/ipv4/sysctl_net_ipv4.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,15 @@ static struct ctl_table ipv4_table[] = {
732732
.extra1 = &zero,
733733
.extra2 = &gso_max_segs,
734734
},
735+
{
736+
.procname = "tcp_autocorking",
737+
.data = &sysctl_tcp_autocorking,
738+
.maxlen = sizeof(int),
739+
.mode = 0644,
740+
.proc_handler = proc_dointvec_minmax,
741+
.extra1 = &zero,
742+
.extra2 = &one,
743+
},
735744
{
736745
.procname = "udp_mem",
737746
.data = &sysctl_udp_mem,

net/ipv4/tcp.c

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285285

286286
int sysctl_tcp_min_tso_segs __read_mostly = 2;
287287

288+
int sysctl_tcp_autocorking __read_mostly = 1;
289+
288290
struct percpu_counter tcp_orphan_count;
289291
EXPORT_SYMBOL_GPL(tcp_orphan_count);
290292

@@ -619,19 +621,52 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
619621
tp->snd_up = tp->write_seq;
620622
}
621623

622-
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
623-
int nonagle)
624+
/* If a not yet filled skb is pushed, do not send it if
625+
* we have packets in Qdisc or NIC queues :
626+
* Because TX completion will happen shortly, it gives a chance
627+
* to coalesce future sendmsg() payload into this skb, without
628+
* need for a timer, and with no latency trade off.
629+
* As packets containing data payload have a bigger truesize
630+
* than pure acks (dataless) packets, the last check prevents
631+
* autocorking if we only have an ACK in Qdisc/NIC queues.
632+
*/
633+
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
634+
int size_goal)
624635
{
625-
if (tcp_send_head(sk)) {
626-
struct tcp_sock *tp = tcp_sk(sk);
636+
return skb->len < size_goal &&
637+
sysctl_tcp_autocorking &&
638+
atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
639+
}
640+
641+
static void tcp_push(struct sock *sk, int flags, int mss_now,
642+
int nonagle, int size_goal)
643+
{
644+
struct tcp_sock *tp = tcp_sk(sk);
645+
struct sk_buff *skb;
627646

628-
if (!(flags & MSG_MORE) || forced_push(tp))
629-
tcp_mark_push(tp, tcp_write_queue_tail(sk));
647+
if (!tcp_send_head(sk))
648+
return;
649+
650+
skb = tcp_write_queue_tail(sk);
651+
if (!(flags & MSG_MORE) || forced_push(tp))
652+
tcp_mark_push(tp, skb);
653+
654+
tcp_mark_urg(tp, flags);
655+
656+
if (tcp_should_autocork(sk, skb, size_goal)) {
630657

631-
tcp_mark_urg(tp, flags);
632-
__tcp_push_pending_frames(sk, mss_now,
633-
(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
658+
/* avoid atomic op if TSQ_THROTTLED bit is already set */
659+
if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
660+
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
661+
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
662+
}
663+
return;
634664
}
665+
666+
if (flags & MSG_MORE)
667+
nonagle = TCP_NAGLE_CORK;
668+
669+
__tcp_push_pending_frames(sk, mss_now, nonagle);
635670
}
636671

637672
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
@@ -934,7 +969,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
934969
wait_for_sndbuf:
935970
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
936971
wait_for_memory:
937-
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
972+
tcp_push(sk, flags & ~MSG_MORE, mss_now,
973+
TCP_NAGLE_PUSH, size_goal);
938974

939975
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
940976
goto do_error;
@@ -944,7 +980,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
944980

945981
out:
946982
if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
947-
tcp_push(sk, flags, mss_now, tp->nonagle);
983+
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
948984
return copied;
949985

950986
do_error:
@@ -1225,7 +1261,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
12251261
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
12261262
wait_for_memory:
12271263
if (copied)
1228-
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1264+
tcp_push(sk, flags & ~MSG_MORE, mss_now,
1265+
TCP_NAGLE_PUSH, size_goal);
12291266

12301267
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
12311268
goto do_error;
@@ -1236,7 +1273,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
12361273

12371274
out:
12381275
if (copied)
1239-
tcp_push(sk, flags, mss_now, tp->nonagle);
1276+
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
12401277
release_sock(sk);
12411278
return copied + copied_syn;
12421279

0 commit comments

Comments
 (0)