Skip to content

Commit f214f91

Browse files
wdebruij authored and davem330 committed
tcp: enable MSG_ZEROCOPY
Enable support for MSG_ZEROCOPY in the TCP stack. TSO and GSO are both supported. Only data sent to remote destinations is sent without copying. Packets looped onto a local destination have their payload copied to avoid unbounded latency. Tested: A 10x TCP_STREAM between two hosts showed a reduction in netserver process cycles by up to 70%, depending on packet size. Systemwide, savings are of course much less pronounced, at up to 20% best case. msg_zerocopy.sh 4 tcp: without zerocopy tx=121792 (7600 MB) txc=0 zc=n rx=60458 (7600 MB) with zerocopy tx=286257 (17863 MB) txc=286257 zc=y rx=140022 (17863 MB) This test opens a pair of sockets over veth; one calls send with 64KB and optionally MSG_ZEROCOPY, and the other reads the initial bytes. The receiver truncates, so this is strictly an upper bound on what is achievable. It is more representative of sending data out of a physical NIC (when payload is not touched, either). Signed-off-by: Willem de Bruijn <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent a91dbff commit f214f91

File tree

1 file changed

+31
-1
lines changed

1 file changed

+31
-1
lines changed

net/ipv4/tcp.c

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1165,6 +1165,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
11651165
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
11661166
{
11671167
struct tcp_sock *tp = tcp_sk(sk);
1168+
struct ubuf_info *uarg = NULL;
11681169
struct sk_buff *skb;
11691170
struct sockcm_cookie sockc;
11701171
int flags, err, copied = 0;
@@ -1174,6 +1175,26 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
11741175
long timeo;
11751176

11761177
flags = msg->msg_flags;
1178+
1179+
if (flags & MSG_ZEROCOPY && size) {
1180+
if (sk->sk_state != TCP_ESTABLISHED) {
1181+
err = -EINVAL;
1182+
goto out_err;
1183+
}
1184+
1185+
skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
1186+
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1187+
if (!uarg) {
1188+
err = -ENOBUFS;
1189+
goto out_err;
1190+
}
1191+
1192+
/* skb may be freed in main loop, keep extra ref on uarg */
1193+
sock_zerocopy_get(uarg);
1194+
if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
1195+
uarg->zerocopy = 0;
1196+
}
1197+
11771198
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
11781199
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
11791200
if (err == -EINPROGRESS && copied_syn > 0)
@@ -1297,7 +1318,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
12971318
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
12981319
if (err)
12991320
goto do_fault;
1300-
} else {
1321+
} else if (!uarg || !uarg->zerocopy) {
13011322
bool merge = true;
13021323
int i = skb_shinfo(skb)->nr_frags;
13031324
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1335,6 +1356,13 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13351356
page_ref_inc(pfrag->page);
13361357
}
13371358
pfrag->offset += copy;
1359+
} else {
1360+
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1361+
if (err == -EMSGSIZE || err == -EEXIST)
1362+
goto new_segment;
1363+
if (err < 0)
1364+
goto do_error;
1365+
copy = err;
13381366
}
13391367

13401368
if (!copied)
@@ -1381,6 +1409,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13811409
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
13821410
}
13831411
out_nopush:
1412+
sock_zerocopy_put(uarg);
13841413
return copied + copied_syn;
13851414

13861415
do_fault:
@@ -1397,6 +1426,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
13971426
if (copied + copied_syn)
13981427
goto out;
13991428
out_err:
1429+
sock_zerocopy_put_abort(uarg);
14001430
err = sk_stream_error(sk, flags, err);
14011431
/* make sure we wake any epoll edge trigger waiter */
14021432
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&

0 commit comments

Comments
 (0)