
Commit 65249fe

mina authored and kuba-moo committed
net: add support for skbs with unreadable frags
For device memory TCP, we expect the skb headers to be available in
host memory for access, and we expect the skb frags to be in device
memory and inaccessible to the host. We expect there to be no mixing
and matching of device memory frags (inaccessible) with host memory
frags (accessible) in the same skb.

Add a skb->devmem flag which indicates whether the frags in this skb
are device memory frags or not.

__skb_fill_netmem_desc() now checks frags added to skbs for net_iov,
and marks the skb as skb->devmem accordingly.

Add checks through the network stack to avoid accessing the frags of
devmem skbs and avoid coalescing devmem skbs with non-devmem skbs.

Signed-off-by: Willem de Bruijn <[email protected]>
Signed-off-by: Kaiyuan Zhang <[email protected]>
Signed-off-by: Mina Almasry <[email protected]>
Reviewed-by: Eric Dumazet <[email protected]>
Reviewed-by: Jakub Kicinski <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 9f6b619 commit 65249fe
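
A note before the diff: the message above says skb->devmem, but the bit as
merged is named skb->unreadable, read through the new skb_frags_readable()
helper. Every call site this commit touches follows one pattern, sketched
below; walk_frags() is a hypothetical consumer, not code from the commit.

#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/printk.h>

/* Hypothetical consumer showing the guard this commit adds throughout
 * the stack: check readability before dereferencing frag pages.
 */
static int walk_frags(const struct sk_buff *skb)
{
	int i;

	/* Device memory frags cannot be mapped or memcpy'd by the host;
	 * bail out before touching skb_shinfo(skb)->frags.
	 */
	if (!skb_frags_readable(skb))
		return -EFAULT;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		/* Safe here: every frag is backed by a host page. */
		pr_debug("frag %d: len %u\n", i, skb_frag_size(frag));
	}
	return 0;
}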

File tree: 9 files changed (+89, −11 lines)


include/linux/skbuff.h

Lines changed: 17 additions & 2 deletions
@@ -827,6 +827,8 @@ enum skb_tstamp_type {
  *	@csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
+ *	@unreadable: indicates that at least 1 of the fragments in this skb is
+ *		unreadable.
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required

@@ -1008,7 +1010,7 @@ struct sk_buff {
 #if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
 #endif
-
+	__u8			unreadable:1;
 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
 	__u16			tc_index;	/* traffic control index */
 #endif

@@ -1824,6 +1826,12 @@ static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
 		__skb_zcopy_downgrade_managed(skb);
 }
 
+/* Return true if frags in this skb are readable by the host. */
+static inline bool skb_frags_readable(const struct sk_buff *skb)
+{
+	return !skb->unreadable;
+}
+
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;

@@ -2540,10 +2548,17 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
 static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
 					  netmem_ref netmem, int off, int size)
 {
-	struct page *page = netmem_to_page(netmem);
+	struct page *page;
 
 	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
 
+	if (netmem_is_net_iov(netmem)) {
+		skb->unreadable = true;
+		return;
+	}
+
+	page = netmem_to_page(netmem);
+
 	/* Propagate page pfmemalloc to the skb if we can. The problem is
 	 * that not all callers have unique ownership of the page but rely
 	 * on page_is_pfmemalloc doing the right thing(tm).
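
Read straight through, the helper now short-circuits before any struct page
conversion. The reconstruction below takes the hunk above plus the
pre-existing pfmemalloc tail; treat anything outside the hunk as an
assumption about the surrounding tree.

static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
					  netmem_ref netmem, int off, int size)
{
	struct page *page;

	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);

	/* net_iov-backed netmem has no host mapping: mark the skb and
	 * skip pfmemalloc propagation, which needs a real struct page.
	 */
	if (netmem_is_net_iov(netmem)) {
		skb->unreadable = true;
		return;
	}

	page = netmem_to_page(netmem);

	/* Propagate page pfmemalloc to the skb if we can. The problem is
	 * that not all callers have unique ownership of the page but rely
	 * on page_is_pfmemalloc doing the right thing(tm).
	 */
	if (page_is_pfmemalloc(page))
		skb->pfmemalloc = true;
}

Note the bit is one-way in this path: once a net_iov frag lands in an skb,
nothing here clears skb->unreadable, which is what lets a single flag stand
in for "no mixing of frag types in one skb".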

include/net/tcp.h

Lines changed: 2 additions & 1 deletion
@@ -1069,7 +1069,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 	/* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */
 	return likely(tcp_skb_can_collapse_to(to) &&
 		      mptcp_skb_can_collapse(to, from) &&
-		      skb_pure_zcopy_same(to, from));
+		      skb_pure_zcopy_same(to, from) &&
+		      skb_frags_readable(to) == skb_frags_readable(from));
 }
 
 static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
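
After the patch the whole predicate reads as below (body taken from the
hunk; the second parameter name is assumed from the surrounding tree). The
new clause makes readability a collapse invariant: two skbs merge only when
both, or neither, carry devmem frags.

static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
					const struct sk_buff *from)
{
	/* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */
	return likely(tcp_skb_can_collapse_to(to) &&
		      mptcp_skb_can_collapse(to, from) &&
		      skb_pure_zcopy_same(to, from) &&
		      skb_frags_readable(to) == skb_frags_readable(from));
}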

net/core/datagram.c

Lines changed: 6 additions & 0 deletions
@@ -407,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
 			return 0;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto short_copy;
+
 	/* Copy paged appendix. Hmm... why does this look so complicated? */
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;

@@ -623,6 +626,9 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 {
 	int frag = skb_shinfo(skb)->nr_frags;
 
+	if (!skb_frags_readable(skb))
+		return -EFAULT;
+
 	while (length && iov_iter_count(from)) {
 		struct page *head, *last_head = NULL;
 		struct page *pages[MAX_SKB_FRAGS];
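
The consequence for copy paths: a plain recvmsg() walking a devmem skb can
deliver at most the linear header (the frag walk falls through to
short_copy), and user pages can no longer be appended to an unreadable skb.
A hedged caller-side sketch; datagram_copy_or_fault() is hypothetical, not
part of the patch.

#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/uio.h>

/* Hypothetical wrapper: refuse up front rather than let a host-memory
 * copy routine run into device-memory frags.
 */
static int datagram_copy_or_fault(struct sk_buff *skb, struct iov_iter *to,
				  int len)
{
	/* Host iterators may read the linear area, never devmem frags. */
	if (len > skb_headlen(skb) && !skb_frags_readable(skb))
		return -EFAULT;

	return skb_copy_datagram_iter(skb, 0, to, len);
}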

net/core/dev.c

Lines changed: 4 additions & 0 deletions
@@ -3312,6 +3312,10 @@ int skb_checksum_help(struct sk_buff *skb)
 		return -EINVAL;
 	}
 
+	if (!skb_frags_readable(skb)) {
+		return -EFAULT;
+	}
+
 	/* Before computing a checksum, we should make sure no frag could
 	 * be modified by an external entity : checksum could be wrong.
 	 */
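
skb_checksum_help() is the software-checksum fallback, and software
checksumming must read every payload byte. Refusing unreadable skbs implies
a driver-side constraint: a NIC carrying devmem TCP traffic has to checksum
in hardware, because the CPU fallback beneath it is gone. An illustrative
driver-side view; xmit_prepare() is hypothetical, not from this commit.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>

/* Illustrative only: a driver that cannot offload this skb's checksum
 * would normally fall back to skb_checksum_help(); for devmem skbs that
 * now fails (-EFAULT), so the packet must be dropped, not computed.
 */
static int xmit_prepare(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EIO;	/* covers the new devmem -EFAULT case */

	return 0;
}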

net/core/skbuff.c

Lines changed: 41 additions & 2 deletions
@@ -1972,6 +1972,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
 		return -EINVAL;
 
+	if (!skb_frags_readable(skb))
+		return -EFAULT;
+
 	if (!num_frags)
 		goto release;

@@ -2145,6 +2148,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 	unsigned int size;
 	int headerlen;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
 		return NULL;

@@ -2483,6 +2489,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	struct sk_buff *n;
 	int oldheadroom;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
 		return NULL;

@@ -2827,6 +2836,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	 */
 	int i, k, eat = (skb->tail + delta) - skb->end;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (eat > 0 || skb_cloned(skb)) {
 		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
 				     GFP_ATOMIC))

@@ -2980,6 +2992,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 		to += copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto fault;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
 		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

@@ -3168,6 +3183,9 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	/*
 	 * then map the fragments
 	 */
+	if (!skb_frags_readable(skb))
+		return false;
+
 	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

@@ -3391,6 +3409,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
 		from += copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto fault;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 		int end;

@@ -3470,6 +3491,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
 		pos = copy;
 	}
 
+	if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+		return 0;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

@@ -3570,6 +3594,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 		pos = copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		return 0;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;

@@ -4061,6 +4088,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
 		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
 
 	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+	skb1->unreadable	   = skb->unreadable;
 	skb_shinfo(skb)->nr_frags  = 0;
 	skb1->data_len		   = skb->data_len;
 	skb1->len		   += skb1->data_len;

@@ -4108,6 +4136,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 		pos += size;
 	}
 	skb_shinfo(skb1)->nr_frags = k;
+
+	skb1->unreadable = skb->unreadable;
 }
 
 /**

@@ -4345,6 +4375,9 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
 		return block_limit - abs_offset;
 	}
 
+	if (!skb_frags_readable(st->cur_skb))
+		return 0;
+
 	if (st->frag_idx == 0 && !st->frag_data)
 		st->stepped_offset += skb_headlen(st->cur_skb);

@@ -5992,7 +6025,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	if (to->pp_recycle != from->pp_recycle)
 		return false;
 
-	if (len <= skb_tailroom(to)) {
+	if (skb_frags_readable(from) != skb_frags_readable(to))
+		return false;
+
+	if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
 		if (len)
 			BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
 		*delta_truesize = 0;

@@ -6169,6 +6205,9 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
 	if (!pskb_may_pull(skb, write_len))
 		return -ENOMEM;
 
+	if (!skb_frags_readable(skb))
+		return -EFAULT;
+
 	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
 		return 0;

@@ -6848,7 +6887,7 @@ void skb_condense(struct sk_buff *skb)
 {
 	if (skb->data_len) {
 		if (skb->data_len > skb->end - skb->tail ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) || !skb_frags_readable(skb))
			return;
 
 		/* Nice, we can free page frag(s) right now */
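
The pattern across these hunks: every generic helper that reads or writes
frag pages bails out early, while skb_split() merely moves frags and so
only has to propagate the bit. That propagation is the subtle part; a
hypothetical frag-moving helper makes the rule explicit (len/data_len
bookkeeping elided for brevity).

#include <linux/skbuff.h>

/* Hypothetical illustration: moving frags without copying their bytes is
 * still legal for devmem skbs, but the destination must inherit the flag
 * or a later frag walk could dereference device memory.
 */
static void move_all_frags(struct sk_buff *dst, struct sk_buff *src)
{
	int i;

	for (i = 0; i < skb_shinfo(src)->nr_frags; i++)
		skb_shinfo(dst)->frags[i] = skb_shinfo(src)->frags[i];
	skb_shinfo(dst)->nr_frags = skb_shinfo(src)->nr_frags;
	skb_shinfo(src)->nr_frags = 0;

	/* The step skb_split() now performs: carry readability across. */
	dst->unreadable = src->unreadable;
}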

net/ipv4/tcp.c

Lines changed: 3 additions & 0 deletions
@@ -2160,6 +2160,9 @@ static int tcp_zerocopy_receive(struct sock *sk,
 			skb = tcp_recv_skb(sk, seq, &offset);
 		}
 
+		if (!skb_frags_readable(skb))
+			break;
+
 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
 			tcp_update_recv_tstamps(skb, tss);
 			zc->msg_flags |= TCP_CMSG_TS;
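
tcp_zerocopy_receive() implements the older mmap-style zerocopy, which
remaps frag pages into the caller's address space; net_iov frags have no
struct page to remap, so the walk simply stops at the first devmem skb and
the caller keeps whatever was mapped so far. A condensed, illustrative
sketch of that loop shape (zc_mappable_limit() is hypothetical and the real
bookkeeping around tcp_recv_skb() is elided):

#include <linux/skbuff.h>
#include <net/tcp.h>

/* Illustrative only: advance until payload stops being host-mappable. */
static u32 zc_mappable_limit(struct sock *sk, u32 seq)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (!skb_frags_readable(skb))
			break;	/* devmem frags: no struct page to remap */
		seq = TCP_SKB_CB(skb)->end_seq;
	}
	return seq;	/* bytes below this can be mapped into user VAs */
}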

net/ipv4/tcp_input.c

Lines changed: 10 additions & 3 deletions
@@ -5391,6 +5391,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
 		n = tcp_skb_next(skb, list);
 
+		if (!skb_frags_readable(skb))
+			goto skip_this;
+
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			skb = tcp_collapse_one(sk, skb, list, root);

@@ -5411,17 +5414,20 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 			break;
 		}
 
-		if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) &&
+		if (n && n != tail && skb_frags_readable(n) &&
+		    tcp_skb_can_collapse_rx(skb, n) &&
 		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
 			end_of_skbs = false;
 			break;
 		}
 
+skip_this:
 		/* Decided to skip this, advance start seq. */
 		start = TCP_SKB_CB(skb)->end_seq;
 	}
 	if (end_of_skbs ||
-	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+	    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+	    !skb_frags_readable(skb))
 		return;
 
 	__skb_queue_head_init(&tmp);

@@ -5463,7 +5469,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 			if (!skb ||
 			    skb == tail ||
 			    !tcp_skb_can_collapse_rx(nskb, skb) ||
-			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+			    !skb_frags_readable(skb))
 				goto end;
 		}
 	}
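
tcp_collapse() rebuilds queued payload into fresh linear skbs via memcpy,
which is impossible for device memory, so devmem skbs are skipped (their
sequence range is still advanced past) and a collapse run ends at the
readable/unreadable boundary. The eligibility rules folded into one
hypothetical predicate, for illustration only:

#include <net/tcp.h>

/* Illustrative helper (not in the patch): when may an skb take part in
 * receive-side collapsing after this change?
 */
static bool rx_collapse_candidate(const struct sk_buff *skb)
{
	/* Payload must be host-readable to be memcpy'd into a new skb. */
	if (!skb_frags_readable(skb))
		return false;

	/* SYN/FIN carriers already terminated a collapse run before. */
	if (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))
		return false;

	return true;
}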

net/ipv4/tcp_output.c

Lines changed: 4 additions & 1 deletion
@@ -2344,7 +2344,8 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 
 		if (unlikely(TCP_SKB_CB(skb)->eor) ||
 		    tcp_has_tx_tstamp(skb) ||
-		    !skb_pure_zcopy_same(skb, next))
+		    !skb_pure_zcopy_same(skb, next) ||
+		    skb_frags_readable(skb) != skb_frags_readable(next))
 			return false;
 
 		len -= skb->len;

@@ -3264,6 +3265,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
 		return false;
 	if (skb_cloned(skb))
 		return false;
+	if (!skb_frags_readable(skb))
+		return false;
 	/* Some heuristics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 		return false;
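
The tx side mirrors the rx rules: tcp_can_coalesce_send_queue_head()
refuses to merge send-queue skbs that differ in readability, and
retransmit collapsing (tcp_can_collapse()) skips devmem skbs outright,
since collapsing copies payload bytes. The two rules condensed into one
hypothetical check, for illustration:

#include <net/tcp.h>

/* Hypothetical helper condensing the tx-side rules added here. */
static bool tx_merge_ok(const struct sk_buff *skb,
			const struct sk_buff *next)
{
	/* Never mix host-memory and device-memory payload in one skb. */
	if (skb_frags_readable(skb) != skb_frags_readable(next))
		return false;

	/* Collapsing copies bytes, so both sides must be readable. */
	return skb_frags_readable(skb);
}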

net/packet/af_packet.c

Lines changed: 2 additions & 2 deletions
@@ -2216,7 +2216,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	snaplen = skb->len;
+	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
 	res = run_filter(skb, sk, snaplen);
 	if (!res)

@@ -2336,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	snaplen = skb->len;
+	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
 	res = run_filter(skb, sk, snaplen);
 	if (!res)
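
For packet sockets the change is visible to userspace: a capture tool such
as tcpdump on a device doing devmem TCP sees only the host-readable
portion, because snaplen, the cap on how many bytes are copied out, is
clamped to the linear area. Equivalent logic as a hypothetical helper:

#include <linux/skbuff.h>

/* Hypothetical helper: bytes of an skb that AF_PACKET may copy out
 * after this change.
 */
static unsigned int capture_len(const struct sk_buff *skb)
{
	/* devmem payload is invisible here: linear (header) area only */
	return skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
}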
