Skip to content

Commit 9bf412d

Browse files
author
Martin KaFai Lau
committed
Merge branch 'bpf-fix-wrong-copied_seq-calculation-and-add-tests'
Jiayuan Chen says: ==================== A previous commit described in this topic http://lore.kernel.org/bpf/[email protected] directly updated 'sk->copied_seq' in the tcp_eat_skb() function when the action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. That commit works for a single stream_verdict scenario, as it also modified 'sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb' to remove updating 'sk->copied_seq'. However, for programs where both stream_parser and stream_verdict are active (strparser purpose), tcp_read_sock() was used instead of tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock). tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicated updates. In summary, for strparser + SK_PASS, copied_seq is redundantly calculated in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). The issue causes incorrect copied_seq calculations, which prevent correct data reads from the recv() interface in user-land. Also we added test cases for bpf + strparser and separated them from sockmap_basic, as strparser has more encapsulation and parsing capabilities compared to sockmap. ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Martin KaFai Lau <[email protected]>
2 parents bc27c52 + 6fcfe96 commit 9bf412d

File tree

12 files changed

+610
-65
lines changed

12 files changed

+610
-65
lines changed

Documentation/networking/strparser.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ Functions
112112
Callbacks
113113
=========
114114

115-
There are six callbacks:
115+
There are seven callbacks:
116116

117117
::
118118

@@ -182,6 +182,13 @@ There are six callbacks:
182182
the length of the message. skb->len - offset may be greater
183183
then full_len since strparser does not trim the skb.
184184

185+
::
186+
187+
int (*read_sock)(struct strparser *strp, read_descriptor_t *desc,
188+
sk_read_actor_t recv_actor);
189+
190+
The read_sock callback is used by strparser instead of
191+
sock->ops->read_sock, if provided.
185192
::
186193

187194
int (*read_sock_done)(struct strparser *strp, int err);

include/linux/skmsg.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ struct sk_psock {
9191
struct sk_psock_progs progs;
9292
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
9393
struct strparser strp;
94+
u32 copied_seq;
95+
u32 ingress_bytes;
9496
#endif
9597
struct sk_buff_head ingress_skb;
9698
struct list_head ingress_msg;

include/net/strparser.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ struct strparser;
4343
struct strp_callbacks {
4444
int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
4545
void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
46+
int (*read_sock)(struct strparser *strp, read_descriptor_t *desc,
47+
sk_read_actor_t recv_actor);
4648
int (*read_sock_done)(struct strparser *strp, int err);
4749
void (*abort_parser)(struct strparser *strp, int err);
4850
void (*lock)(struct strparser *strp);

include/net/tcp.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *);
729729
/* Read 'sendfile()'-style from a TCP socket */
730730
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
731731
sk_read_actor_t recv_actor);
732+
int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
733+
sk_read_actor_t recv_actor, bool noack,
734+
u32 *copied_seq);
732735
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
733736
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
734737
void tcp_read_done(struct sock *sk, size_t len);
@@ -2599,6 +2602,11 @@ struct sk_psock;
25992602
#ifdef CONFIG_BPF_SYSCALL
26002603
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
26012604
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
2605+
#ifdef CONFIG_BPF_STREAM_PARSER
2606+
struct strparser;
2607+
int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
2608+
sk_read_actor_t recv_actor);
2609+
#endif /* CONFIG_BPF_STREAM_PARSER */
26022610
#endif /* CONFIG_BPF_SYSCALL */
26032611

26042612
#ifdef CONFIG_INET

net/core/skmsg.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
549549
return num_sge;
550550
}
551551

552+
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
553+
psock->ingress_bytes += len;
554+
#endif
552555
copied = len;
553556
msg->sg.start = 0;
554557
msg->sg.size = copied;
@@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
11441147
if (!ret)
11451148
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
11461149

1150+
if (sk_is_tcp(sk)) {
1151+
psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
1152+
psock->copied_seq = tcp_sk(sk)->copied_seq;
1153+
}
11471154
return ret;
11481155
}
11491156

net/core/sock_map.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
303303

304304
write_lock_bh(&sk->sk_callback_lock);
305305
if (stream_parser && stream_verdict && !psock->saved_data_ready) {
306-
ret = sk_psock_init_strp(sk, psock);
306+
if (sk_is_tcp(sk))
307+
ret = sk_psock_init_strp(sk, psock);
308+
else
309+
ret = -EOPNOTSUPP;
307310
if (ret) {
308311
write_unlock_bh(&sk->sk_callback_lock);
309312
sk_psock_put(sk, psock);

net/ipv4/tcp.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
15651565
* or for 'peeking' the socket using this routine
15661566
* (although both would be easy to implement).
15671567
*/
1568-
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1569-
sk_read_actor_t recv_actor)
1568+
static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1569+
sk_read_actor_t recv_actor, bool noack,
1570+
u32 *copied_seq)
15701571
{
15711572
struct sk_buff *skb;
15721573
struct tcp_sock *tp = tcp_sk(sk);
1573-
u32 seq = tp->copied_seq;
1574+
u32 seq = *copied_seq;
15741575
u32 offset;
15751576
int copied = 0;
15761577

@@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
16241625
tcp_eat_recv_skb(sk, skb);
16251626
if (!desc->count)
16261627
break;
1627-
WRITE_ONCE(tp->copied_seq, seq);
1628+
WRITE_ONCE(*copied_seq, seq);
16281629
}
1629-
WRITE_ONCE(tp->copied_seq, seq);
1630+
WRITE_ONCE(*copied_seq, seq);
1631+
1632+
if (noack)
1633+
goto out;
16301634

16311635
tcp_rcv_space_adjust(sk);
16321636

@@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
16351639
tcp_recv_skb(sk, seq, &offset);
16361640
tcp_cleanup_rbuf(sk, copied);
16371641
}
1642+
out:
16381643
return copied;
16391644
}
1645+
1646+
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1647+
sk_read_actor_t recv_actor)
1648+
{
1649+
return __tcp_read_sock(sk, desc, recv_actor, false,
1650+
&tcp_sk(sk)->copied_seq);
1651+
}
16401652
EXPORT_SYMBOL(tcp_read_sock);
16411653

1654+
int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
1655+
sk_read_actor_t recv_actor, bool noack,
1656+
u32 *copied_seq)
1657+
{
1658+
return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
1659+
}
1660+
16421661
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
16431662
{
16441663
struct sk_buff *skb;

net/ipv4/tcp_bpf.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
646646
ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
647647
}
648648

649+
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
650+
int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
651+
sk_read_actor_t recv_actor)
652+
{
653+
struct sock *sk = strp->sk;
654+
struct sk_psock *psock;
655+
struct tcp_sock *tp;
656+
int copied = 0;
657+
658+
tp = tcp_sk(sk);
659+
rcu_read_lock();
660+
psock = sk_psock(sk);
661+
if (WARN_ON_ONCE(!psock)) {
662+
desc->error = -EINVAL;
663+
goto out;
664+
}
665+
666+
psock->ingress_bytes = 0;
667+
copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
668+
&psock->copied_seq);
669+
if (copied < 0)
670+
goto out;
671+
/* recv_actor may redirect skb to another socket (SK_REDIRECT) or
672+
* just put skb into ingress queue of current socket (SK_PASS).
673+
* For SK_REDIRECT, we need to ack the frame immediately but for
674+
* SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
675+
*/
676+
tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
677+
tcp_rcv_space_adjust(sk);
678+
__tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
679+
out:
680+
rcu_read_unlock();
681+
return copied;
682+
}
683+
#endif /* CONFIG_BPF_STREAM_PARSER */
684+
649685
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
650686
{
651687
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;

net/strparser/strparser.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,15 +347,21 @@ static int strp_read_sock(struct strparser *strp)
347347
struct socket *sock = strp->sk->sk_socket;
348348
read_descriptor_t desc;
349349

350-
if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
350+
if (unlikely(!sock || !sock->ops))
351+
return -EBUSY;
352+
353+
if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock))
351354
return -EBUSY;
352355

353356
desc.arg.data = strp;
354357
desc.error = 0;
355358
desc.count = 1; /* give more than one skb per call */
356359

357360
/* sk should be locked here, so okay to do read_sock */
358-
sock->ops->read_sock(strp->sk, &desc, strp_recv);
361+
if (strp->cb.read_sock)
362+
strp->cb.read_sock(strp, &desc, strp_recv);
363+
else
364+
sock->ops->read_sock(strp->sk, &desc, strp_recv);
359365

360366
desc.error = strp->cb.read_sock_done(strp, desc.error);
361367

@@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk,
468474
strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
469475
strp->cb.rcv_msg = cb->rcv_msg;
470476
strp->cb.parse_msg = cb->parse_msg;
477+
strp->cb.read_sock = cb->read_sock;
471478
strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
472479
strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;
473480

tools/testing/selftests/bpf/prog_tests/sockmap_basic.c

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -522,66 +522,15 @@ static void test_sockmap_skb_verdict_shutdown(void)
522522
if (!ASSERT_EQ(err, 1, "epoll_wait(fd)"))
523523
goto out_close;
524524

525-
n = recv(c1, &b, 1, SOCK_NONBLOCK);
526-
ASSERT_EQ(n, 0, "recv_timeout(fin)");
525+
n = recv(c1, &b, 1, MSG_DONTWAIT);
526+
ASSERT_EQ(n, 0, "recv(fin)");
527527
out_close:
528528
close(c1);
529529
close(p1);
530530
out:
531531
test_sockmap_pass_prog__destroy(skel);
532532
}
533533

534-
static void test_sockmap_stream_pass(void)
535-
{
536-
int zero = 0, sent, recvd;
537-
int verdict, parser;
538-
int err, map;
539-
int c = -1, p = -1;
540-
struct test_sockmap_pass_prog *pass = NULL;
541-
char snd[256] = "0123456789";
542-
char rcv[256] = "0";
543-
544-
pass = test_sockmap_pass_prog__open_and_load();
545-
verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
546-
parser = bpf_program__fd(pass->progs.prog_skb_parser);
547-
map = bpf_map__fd(pass->maps.sock_map_rx);
548-
549-
err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0);
550-
if (!ASSERT_OK(err, "bpf_prog_attach stream parser"))
551-
goto out;
552-
553-
err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0);
554-
if (!ASSERT_OK(err, "bpf_prog_attach stream verdict"))
555-
goto out;
556-
557-
err = create_pair(AF_INET, SOCK_STREAM, &c, &p);
558-
if (err)
559-
goto out;
560-
561-
/* sk_data_ready of 'p' will be replaced by strparser handler */
562-
err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST);
563-
if (!ASSERT_OK(err, "bpf_map_update_elem(p)"))
564-
goto out_close;
565-
566-
/*
567-
* as 'prog_skb_parser' return the original skb len and
568-
* 'prog_skb_verdict' return SK_PASS, the kernel will just
569-
* pass it through to original socket 'p'
570-
*/
571-
sent = xsend(c, snd, sizeof(snd), 0);
572-
ASSERT_EQ(sent, sizeof(snd), "xsend(c)");
573-
574-
recvd = recv_timeout(p, rcv, sizeof(rcv), SOCK_NONBLOCK,
575-
IO_TIMEOUT_SEC);
576-
ASSERT_EQ(recvd, sizeof(rcv), "recv_timeout(p)");
577-
578-
out_close:
579-
close(c);
580-
close(p);
581-
582-
out:
583-
test_sockmap_pass_prog__destroy(pass);
584-
}
585534

586535
static void test_sockmap_skb_verdict_fionread(bool pass_prog)
587536
{
@@ -628,7 +577,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
628577
ASSERT_EQ(avail, expected, "ioctl(FIONREAD)");
629578
/* On DROP test there will be no data to read */
630579
if (pass_prog) {
631-
recvd = recv_timeout(c1, &buf, sizeof(buf), SOCK_NONBLOCK, IO_TIMEOUT_SEC);
580+
recvd = recv_timeout(c1, &buf, sizeof(buf), MSG_DONTWAIT, IO_TIMEOUT_SEC);
632581
ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(c0)");
633582
}
634583

@@ -1101,8 +1050,6 @@ void test_sockmap_basic(void)
11011050
test_sockmap_progs_query(BPF_SK_SKB_VERDICT);
11021051
if (test__start_subtest("sockmap skb_verdict shutdown"))
11031052
test_sockmap_skb_verdict_shutdown();
1104-
if (test__start_subtest("sockmap stream parser and verdict pass"))
1105-
test_sockmap_stream_pass();
11061053
if (test__start_subtest("sockmap skb_verdict fionread"))
11071054
test_sockmap_skb_verdict_fionread(true);
11081055
if (test__start_subtest("sockmap skb_verdict fionread on drop"))

0 commit comments

Comments
 (0)