Skip to content

Commit 6fa9201

Browse files
jrfastabborkmann
authored andcommitted
bpf, sockmap: Avoid returning unneeded EAGAIN when redirecting to self
If a socket redirects to itself and it is under memory pressure it is possible to get a socket stuck so that recv() returns EAGAIN and the socket can not advance for some time. This happens because when redirecting a skb to the same socket we received the skb on we first check if it is OK to enqueue the skb on the receiving socket by checking memory limits. But, if the skb is itself the object holding the memory needed to enqueue the skb we will keep retrying from kernel side and always fail with EAGAIN. Then userspace will get a recv() EAGAIN error if there are no skbs in the psock ingress queue. This will continue until either some skbs get kfree'd causing the memory pressure to reduce far enough that we can enqueue the pending packet or the socket is destroyed. In some cases its possible to get a socket stuck for a noticeable amount of time if the socket is only receiving skbs from sk_skb verdict programs. To reproduce I make the socket memory limits ridiculously low so sockets are always under memory pressure. More often though if under memory pressure it looks like a spurious EAGAIN error on user space side causing userspace to retry and typically enough has moved on the memory side that it works. To fix skip memory checks and skb_orphan if receiving on the same sock as already assigned. For SK_PASS cases this is easy, its always the same socket so we can just omit the orphan/set_owner pair. For backlog cases we need to check skb->sk and decide if the orphan and set_owner pair are needed. Fixes: 5119940 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]> Reviewed-by: Jakub Sitnicki <[email protected]> Link: https://lore.kernel.org/bpf/160556572660.73229.12566203819812939627.stgit@john-XPS-13-9370
1 parent 70796fb commit 6fa9201

File tree

1 file changed

+53
-19
lines changed

1 file changed

+53
-19
lines changed

net/core/skmsg.c

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -399,38 +399,38 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
399399
}
400400
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
401401

402-
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
402+
static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
403+
struct sk_buff *skb)
403404
{
404-
struct sock *sk = psock->sk;
405-
int copied = 0, num_sge;
406405
struct sk_msg *msg;
407406

408407
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
409-
return -EAGAIN;
408+
return NULL;
409+
410+
if (!sk_rmem_schedule(sk, skb, skb->truesize))
411+
return NULL;
410412

411413
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
412414
if (unlikely(!msg))
413-
return -EAGAIN;
414-
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
415-
kfree(msg);
416-
return -EAGAIN;
417-
}
415+
return NULL;
418416

419417
sk_msg_init(msg);
420-
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
418+
return msg;
419+
}
420+
421+
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
422+
struct sk_psock *psock,
423+
struct sock *sk,
424+
struct sk_msg *msg)
425+
{
426+
int num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
427+
int copied;
428+
421429
if (unlikely(num_sge < 0)) {
422430
kfree(msg);
423431
return num_sge;
424432
}
425433

426-
/* This will transition ownership of the data from the socket where
427-
* the BPF program was run initiating the redirect to the socket
428-
* we will eventually receive this data on. The data will be released
429-
* from skb_consume found in __tcp_bpf_recvmsg() after its been copied
430-
* into user buffers.
431-
*/
432-
skb_set_owner_r(skb, sk);
433-
434434
copied = skb->len;
435435
msg->sg.start = 0;
436436
msg->sg.size = copied;
@@ -442,6 +442,40 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
442442
return copied;
443443
}
444444

445+
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
446+
{
447+
struct sock *sk = psock->sk;
448+
struct sk_msg *msg;
449+
450+
msg = sk_psock_create_ingress_msg(sk, skb);
451+
if (!msg)
452+
return -EAGAIN;
453+
454+
/* This will transition ownership of the data from the socket where
455+
* the BPF program was run initiating the redirect to the socket
456+
* we will eventually receive this data on. The data will be released
457+
* from skb_consume found in __tcp_bpf_recvmsg() after its been copied
458+
* into user buffers.
459+
*/
460+
skb_set_owner_r(skb, sk);
461+
return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
462+
}
463+
464+
/* Puts an skb on the ingress queue of the socket already assigned to the
465+
* skb. In this case we do not need to check memory limits or skb_set_owner_r
466+
* because the skb is already accounted for here.
467+
*/
468+
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb)
469+
{
470+
struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
471+
struct sock *sk = psock->sk;
472+
473+
if (unlikely(!msg))
474+
return -EAGAIN;
475+
sk_msg_init(msg);
476+
return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
477+
}
478+
445479
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
446480
u32 off, u32 len, bool ingress)
447481
{
@@ -801,7 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
801835
* retrying later from workqueue.
802836
*/
803837
if (skb_queue_empty(&psock->ingress_skb)) {
804-
err = sk_psock_skb_ingress(psock, skb);
838+
err = sk_psock_skb_ingress_self(psock, skb);
805839
}
806840
if (err < 0) {
807841
skb_queue_tail(&psock->ingress_skb, skb);

0 commit comments

Comments
 (0)