Skip to content

Commit 1f8b977

Browse files
wdebruij authored and davem330 committed
sock: enable MSG_ZEROCOPY
Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with skb_zerocopy_clone() wherever needed due to skb split, merge, resize or clone. Split skb_orphan_frags into two variants. The split, merge, .. paths support reference counted zerocopy buffers, so do not do a deep copy. Add skb_orphan_frags_rx for paths that may loop packets to receive sockets. That is not allowed, as it may cause unbounded latency. Deep copy all zerocopy buffers, ref-counted or not, in this path. The exact locations to modify were chosen by exhaustively searching through all code that might modify skb_frag references and/or the SKBTX_DEV_ZEROCOPY tx_flags bit. The changes err on the safe side, in two ways. (1) legacy ubuf_info paths virtio and tap are not modified. They keep a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags still call skb_copy_ubufs and thus copy frags in this case. (2) not all copies deep in the stack are addressed yet. skb_shift, skb_split and skb_try_coalesce can be refined to avoid copying. These are not in the hot path and this patch is hairy enough as is, so that is left for future refinement. Signed-off-by: Willem de Bruijn <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 76851d1 commit 1f8b977

File tree

5 files changed

+36
-33
lines changed

5 files changed

+36
-33
lines changed

drivers/net/tun.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
892892
sk_filter(tfile->socket.sk, skb))
893893
goto drop;
894894

895-
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
895+
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
896896
goto drop;
897897

898898
skb_tx_timestamp(skb);

drivers/vhost/net.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ static void handle_tx(struct vhost_net *net)
533533
ubuf->callback = vhost_zerocopy_callback;
534534
ubuf->ctx = nvq->ubufs;
535535
ubuf->desc = nvq->upend_idx;
536+
atomic_set(&ubuf->refcnt, 1);
536537
msg.msg_control = ubuf;
537538
msg.msg_controllen = sizeof(ubuf);
538539
ubufs = nvq->ubufs;

include/linux/skbuff.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2512,7 +2512,17 @@ static inline void skb_orphan(struct sk_buff *skb)
25122512
*/
25132513
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
25142514
{
2515-
if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
2515+
if (likely(!skb_zcopy(skb)))
2516+
return 0;
2517+
if (skb_uarg(skb)->callback == sock_zerocopy_callback)
2518+
return 0;
2519+
return skb_copy_ubufs(skb, gfp_mask);
2520+
}
2521+
2522+
/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */
2523+
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
2524+
{
2525+
if (likely(!skb_zcopy(skb)))
25162526
return 0;
25172527
return skb_copy_ubufs(skb, gfp_mask);
25182528
}
@@ -2944,6 +2954,8 @@ static inline int skb_add_data(struct sk_buff *skb,
29442954
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
29452955
const struct page *page, int off)
29462956
{
2957+
if (skb_zcopy(skb))
2958+
return false;
29472959
if (i) {
29482960
const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];
29492961

net/core/dev.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb,
18531853
struct packet_type *pt_prev,
18541854
struct net_device *orig_dev)
18551855
{
1856-
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1856+
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
18571857
return -ENOMEM;
18581858
refcount_inc(&skb->users);
18591859
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
@@ -4412,7 +4412,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
44124412
}
44134413

44144414
if (pt_prev) {
4415-
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4415+
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
44164416
goto drop;
44174417
else
44184418
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);

net/core/skbuff.c

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb)
567567
for (i = 0; i < shinfo->nr_frags; i++)
568568
__skb_frag_unref(&shinfo->frags[i]);
569569

570-
/*
571-
* If skb buf is from userspace, we need to notify the caller
572-
* the lower device DMA has done;
573-
*/
574-
if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
575-
struct ubuf_info *uarg;
576-
577-
uarg = shinfo->destructor_arg;
578-
if (uarg->callback)
579-
uarg->callback(uarg, true);
580-
}
581-
582570
if (shinfo->frag_list)
583571
kfree_skb_list(shinfo->frag_list);
584572

573+
skb_zcopy_clear(skb, true);
585574
skb_free_head(skb);
586575
}
587576

@@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list);
695684
*/
696685
void skb_tx_error(struct sk_buff *skb)
697686
{
698-
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
699-
struct ubuf_info *uarg;
700-
701-
uarg = skb_shinfo(skb)->destructor_arg;
702-
if (uarg->callback)
703-
uarg->callback(uarg, false);
704-
skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
705-
}
687+
skb_zcopy_clear(skb, true);
706688
}
707689
EXPORT_SYMBOL(skb_tx_error);
708690

@@ -1029,9 +1011,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
10291011
}
10301012
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
10311013

1032-
/* unused only until next patch in the series; will remove attribute */
1033-
static int __attribute__((unused))
1034-
skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
1014+
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
10351015
gfp_t gfp_mask)
10361016
{
10371017
if (skb_zcopy(orig)) {
@@ -1068,7 +1048,6 @@ static int __attribute__((unused))
10681048
*/
10691049
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
10701050
{
1071-
struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
10721051
int num_frags = skb_shinfo(skb)->nr_frags;
10731052
struct page *page, *head = NULL;
10741053
int i, new_frags;
@@ -1127,8 +1106,6 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
11271106
for (i = 0; i < num_frags; i++)
11281107
skb_frag_unref(skb, i);
11291108

1130-
uarg->callback(uarg, false);
1131-
11321109
/* skb frags point to kernel buffers */
11331110
for (i = 0; i < new_frags - 1; i++) {
11341111
__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
@@ -1137,7 +1114,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
11371114
__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
11381115
skb_shinfo(skb)->nr_frags = new_frags;
11391116

1140-
skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
1117+
skb_zcopy_clear(skb, false);
11411118
return 0;
11421119
}
11431120
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
@@ -1298,7 +1275,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
12981275
if (skb_shinfo(skb)->nr_frags) {
12991276
int i;
13001277

1301-
if (skb_orphan_frags(skb, gfp_mask)) {
1278+
if (skb_orphan_frags(skb, gfp_mask) ||
1279+
skb_zerocopy_clone(n, skb, gfp_mask)) {
13021280
kfree_skb(n);
13031281
n = NULL;
13041282
goto out;
@@ -1375,9 +1353,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
13751353
* be since all we did is relocate the values
13761354
*/
13771355
if (skb_cloned(skb)) {
1378-
/* copy this zero copy skb frags */
13791356
if (skb_orphan_frags(skb, gfp_mask))
13801357
goto nofrags;
1358+
if (skb_zcopy(skb))
1359+
atomic_inc(&skb_uarg(skb)->refcnt);
13811360
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
13821361
skb_frag_ref(skb, i);
13831362

@@ -1872,6 +1851,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
18721851
skb->tail += delta;
18731852
skb->data_len -= delta;
18741853

1854+
if (!skb->data_len)
1855+
skb_zcopy_clear(skb, false);
1856+
18751857
return skb_tail_pointer(skb);
18761858
}
18771859
EXPORT_SYMBOL(__pskb_pull_tail);
@@ -2627,6 +2609,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
26272609
skb_tx_error(from);
26282610
return -ENOMEM;
26292611
}
2612+
skb_zerocopy_clone(to, from, GFP_ATOMIC);
26302613

26312614
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
26322615
if (!len)
@@ -2924,6 +2907,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
29242907

29252908
skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
29262909
SKBTX_SHARED_FRAG;
2910+
skb_zerocopy_clone(skb1, skb, 0);
29272911
if (len < pos) /* Split line is inside header. */
29282912
skb_split_inside_header(skb, skb1, len, pos);
29292913
else /* Second chunk has no header, nothing to copy. */
@@ -2967,6 +2951,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
29672951

29682952
if (skb_headlen(skb))
29692953
return 0;
2954+
if (skb_zcopy(tgt) || skb_zcopy(skb))
2955+
return 0;
29702956

29712957
todo = shiftlen;
29722958
from = 0;
@@ -3540,6 +3526,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
35403526

35413527
skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
35423528
SKBTX_SHARED_FRAG;
3529+
if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
3530+
goto err;
35433531

35443532
while (pos < offset + len) {
35453533
if (i >= nfrags) {
@@ -4663,6 +4651,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
46634651

46644652
if (skb_has_frag_list(to) || skb_has_frag_list(from))
46654653
return false;
4654+
if (skb_zcopy(to) || skb_zcopy(from))
4655+
return false;
46664656

46674657
if (skb_headlen(from) != 0) {
46684658
struct page *page;

0 commit comments

Comments
 (0)