Skip to content

Commit b5947e5

Browse files
wdebruij authored and davem330 committed
udp: msg_zerocopy
Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and interpret flag MSG_ZEROCOPY. This patch was previously part of the zerocopy RFC patchsets. Zerocopy is not effective at small MTU. With segmentation offload building larger datagrams, the benefit of page flipping outweighs the cost of generating a completion notification. tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on test patch and making skb_orphan_frags_rx same as skb_orphan_frags: ipv4 udp -t 1 tx=191312 (11938 MB) txc=0 zc=n rx=191312 (11938 MB) ipv4 udp -z -t 1 tx=304507 (19002 MB) txc=304507 zc=y rx=304507 (19002 MB) ok ipv6 udp -t 1 tx=174485 (10888 MB) txc=0 zc=n rx=174485 (10888 MB) ipv6 udp -z -t 1 tx=294801 (18396 MB) txc=294801 zc=y rx=294801 (18396 MB) ok Changes v1 -> v2 - Fixup reverse christmas tree violation v2 -> v3 - Split refcount avoidance optimization into separate patch - Fix refcount leak on error in fragmented case (thanks to Paolo Abeni for pointing this one out!) - Fix refcount inc on zero - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data. This is needed since commit 5cf4a85 ("tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp. Signed-off-by: Willem de Bruijn <[email protected]> Acked-by: Paolo Abeni <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent ce01a56 commit b5947e5

File tree

5 files changed

+55
-3
lines changed

5 files changed

+55
-3
lines changed

include/linux/skbuff.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
485485

486486
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
487487

488+
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
488489
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
489490
struct msghdr *msg, int len,
490491
struct ubuf_info *uarg);

net/core/skbuff.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
11051105
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
11061106
struct iov_iter *from, size_t length);
11071107

1108+
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1109+
{
1110+
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1111+
}
1112+
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1113+
11081114
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
11091115
struct msghdr *msg, int len,
11101116
struct ubuf_info *uarg)

net/core/sock.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
10181018

10191019
case SO_ZEROCOPY:
10201020
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1021-
if (sk->sk_protocol != IPPROTO_TCP)
1021+
if (!((sk->sk_type == SOCK_STREAM &&
1022+
sk->sk_protocol == IPPROTO_TCP) ||
1023+
(sk->sk_type == SOCK_DGRAM &&
1024+
sk->sk_protocol == IPPROTO_UDP)))
10221025
ret = -ENOTSUPP;
10231026
} else if (sk->sk_family != PF_RDS) {
10241027
ret = -ENOTSUPP;

net/ipv4/ip_output.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
867867
unsigned int flags)
868868
{
869869
struct inet_sock *inet = inet_sk(sk);
870+
struct ubuf_info *uarg = NULL;
870871
struct sk_buff *skb;
871872

872873
struct ip_options *opt = cork->opt;
@@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk,
916917
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
917918
csummode = CHECKSUM_PARTIAL;
918919

920+
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
921+
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
922+
if (!uarg)
923+
return -ENOBUFS;
924+
if (rt->dst.dev->features & NETIF_F_SG &&
925+
csummode == CHECKSUM_PARTIAL) {
926+
paged = true;
927+
} else {
928+
uarg->zerocopy = 0;
929+
skb_zcopy_set(skb, uarg);
930+
}
931+
}
932+
919933
cork->length += length;
920934

921935
/* So, what's going on in the loop below?
@@ -1006,6 +1020,7 @@ static int __ip_append_data(struct sock *sk,
10061020
cork->tx_flags = 0;
10071021
skb_shinfo(skb)->tskey = tskey;
10081022
tskey = 0;
1023+
skb_zcopy_set(skb, uarg);
10091024

10101025
/*
10111026
* Find where to start putting bytes.
@@ -1068,7 +1083,7 @@ static int __ip_append_data(struct sock *sk,
10681083
err = -EFAULT;
10691084
goto error;
10701085
}
1071-
} else {
1086+
} else if (!uarg || !uarg->zerocopy) {
10721087
int i = skb_shinfo(skb)->nr_frags;
10731088

10741089
err = -ENOMEM;
@@ -1098,18 +1113,24 @@ static int __ip_append_data(struct sock *sk,
10981113
skb->data_len += copy;
10991114
skb->truesize += copy;
11001115
wmem_alloc_delta += copy;
1116+
} else {
1117+
err = skb_zerocopy_iter_dgram(skb, from, copy);
1118+
if (err < 0)
1119+
goto error;
11011120
}
11021121
offset += copy;
11031122
length -= copy;
11041123
}
11051124

11061125
if (wmem_alloc_delta)
11071126
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1127+
sock_zerocopy_put(uarg);
11081128
return 0;
11091129

11101130
error_efault:
11111131
err = -EFAULT;
11121132
error:
1133+
sock_zerocopy_put_abort(uarg);
11131134
cork->length -= length;
11141135
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
11151136
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

net/ipv6/ip6_output.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
12451245
{
12461246
struct sk_buff *skb, *skb_prev = NULL;
12471247
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248+
struct ubuf_info *uarg = NULL;
12481249
int exthdrlen = 0;
12491250
int dst_exthdrlen = 0;
12501251
int hh_len;
@@ -1322,6 +1323,19 @@ static int __ip6_append_data(struct sock *sk,
13221323
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
13231324
csummode = CHECKSUM_PARTIAL;
13241325

1326+
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1327+
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1328+
if (!uarg)
1329+
return -ENOBUFS;
1330+
if (rt->dst.dev->features & NETIF_F_SG &&
1331+
csummode == CHECKSUM_PARTIAL) {
1332+
paged = true;
1333+
} else {
1334+
uarg->zerocopy = 0;
1335+
skb_zcopy_set(skb, uarg);
1336+
}
1337+
}
1338+
13251339
/*
13261340
* Let's try using as much space as possible.
13271341
* Use MTU if total length of the message fits into the MTU.
@@ -1445,6 +1459,7 @@ static int __ip6_append_data(struct sock *sk,
14451459
cork->tx_flags = 0;
14461460
skb_shinfo(skb)->tskey = tskey;
14471461
tskey = 0;
1462+
skb_zcopy_set(skb, uarg);
14481463

14491464
/*
14501465
* Find where to start putting bytes
@@ -1506,7 +1521,7 @@ static int __ip6_append_data(struct sock *sk,
15061521
err = -EFAULT;
15071522
goto error;
15081523
}
1509-
} else {
1524+
} else if (!uarg || !uarg->zerocopy) {
15101525
int i = skb_shinfo(skb)->nr_frags;
15111526

15121527
err = -ENOMEM;
@@ -1536,18 +1551,24 @@ static int __ip6_append_data(struct sock *sk,
15361551
skb->data_len += copy;
15371552
skb->truesize += copy;
15381553
wmem_alloc_delta += copy;
1554+
} else {
1555+
err = skb_zerocopy_iter_dgram(skb, from, copy);
1556+
if (err < 0)
1557+
goto error;
15391558
}
15401559
offset += copy;
15411560
length -= copy;
15421561
}
15431562

15441563
if (wmem_alloc_delta)
15451564
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1565+
sock_zerocopy_put(uarg);
15461566
return 0;
15471567

15481568
error_efault:
15491569
err = -EFAULT;
15501570
error:
1571+
sock_zerocopy_put_abort(uarg);
15511572
cork->length -= length;
15521573
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
15531574
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);

0 commit comments

Comments
 (0)