
Commit bf3b9f6

ssamudrala authored and davem330 committed
epoll: Add busy poll support to epoll with socket fds.
This patch adds busy poll support to epoll. The implementation is meant to be opportunistic in that it will take the NAPI ID from the last socket that is added to the ready list that contains a valid NAPI ID and it will use that for busy polling until the ready list goes empty. Once the ready list goes empty the NAPI ID is reset and busy polling is disabled until a new socket is added to the ready list.

In addition, when we insert a new socket into the epoll we record the NAPI ID and assume we are going to receive events on it. If that doesn't occur it will be evicted as the active NAPI ID and we will resume normal behavior.

An application can use SO_INCOMING_CPU or SO_REUSEPORT_ATTACH_C/EBPF socket options to spread the incoming connections to specific worker threads based on the incoming queue. This enables epoll for each worker thread to have only sockets that receive packets from a single queue. So when an application calls epoll_wait() and there are no events available to report, busy polling is done on the associated queue to pull the packets.

Signed-off-by: Sridhar Samudrala <[email protected]>
Signed-off-by: Alexander Duyck <[email protected]>
Acked-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
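For context, here is a rough userspace sketch (not part of the commit) of the workload this change targets: one epoll instance per worker thread, so that each instance ideally only sees sockets fed by a single device RX queue. The port number, buffer size, and error handling below are illustrative assumptions; the global knob that net_busy_loop_on() checks is the existing net.core.busy_poll sysctl, and the per-queue steering via SO_INCOMING_CPU or a reuseport BPF program described above is omitted for brevity.

/* Illustrative worker loop: one listener with SO_REUSEPORT and one epoll
 * instance per worker thread. Assumes net.core.busy_poll has been set to a
 * non-zero value so the kernel may busy poll the recorded NAPI ID.
 */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/epoll.h>
#include <sys/socket.h>

#define PORT       8080	/* hypothetical service port */
#define MAX_EVENTS 64

int main(void)
{
	struct sockaddr_in addr = { 0 };
	struct epoll_event ev, events[MAX_EVENTS];
	int one = 1;

	/* Each worker would open its own SO_REUSEPORT listener so incoming
	 * connections are spread across workers; steering connections by RX
	 * queue (SO_INCOMING_CPU or a reuseport BPF program) is not shown. */
	int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
	setsockopt(listen_fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(PORT);
	if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(listen_fd, SOMAXCONN) < 0) {
		perror("bind/listen");
		return 1;
	}

	int epfd = epoll_create1(0);
	ev.events = EPOLLIN;
	ev.data.fd = listen_fd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);

	for (;;) {
		/* When no events are ready and a valid NAPI ID has been
		 * recorded for this epoll instance, the kernel busy polls
		 * that queue here before going to sleep. */
		int n = epoll_wait(epfd, events, MAX_EVENTS, -1);

		for (int i = 0; i < n; i++) {
			if (events[i].data.fd == listen_fd) {
				int conn = accept(listen_fd, NULL, NULL);

				if (conn < 0)
					continue;
				ev.events = EPOLLIN;
				ev.data.fd = conn;
				epoll_ctl(epfd, EPOLL_CTL_ADD, conn, &ev);
			} else {
				char buf[4096];
				ssize_t len = read(events[i].data.fd, buf, sizeof(buf));

				if (len <= 0)
					close(events[i].data.fd); /* also drops it from the epoll set */
				/* ... otherwise process len bytes ... */
			}
		}
	}
}

With busy polling enabled globally, the blocking epoll_wait() above is the point where the kernel spins on the recorded NAPI ID instead of immediately sleeping when the ready list is empty.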
1 parent 7db6b04 commit bf3b9f6

File tree

1 file changed: +93, −0 lines changed


fs/eventpoll.c

Lines changed: 93 additions & 0 deletions
@@ -42,6 +42,7 @@
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <net/busy_poll.h>
 
 /*
  * LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
 	/* used to optimize loop detection check */
 	int visited;
 	struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* used to track busy poll napi_id */
+	unsigned int napi_id;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
 	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct eventpoll *ep = p;
+
+	return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ep->napi_id)
+		ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct eventpoll *ep;
+	unsigned int napi_id;
+	struct socket *sock;
+	struct sock *sk;
+	int err;
+
+	if (!net_busy_loop_on())
+		return;
+
+	sock = sock_from_file(epi->ffd.file, &err);
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	ep = epi->ep;
+
+	/* Non-NAPI IDs can be rejected
+	 * or
+	 * Nothing to do if we already have this ID
+	 */
+	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+		return;
+
+	/* record NAPI ID for use in next busy poll */
+	ep->napi_id = napi_id;
+#endif
+}
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 
 	spin_lock_irqsave(&ep->lock, flags);
 
+	ep_set_busy_poll_napi_id(epi);
+
 	/*
 	 * If the event mask does not contain any poll(2) event, we consider the
 	 * descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	spin_lock_irqsave(&ep->lock, flags);
 
+	/* record NAPI ID of new item if present */
+	ep_set_busy_poll_napi_id(epi);
+
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,9 +1719,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 
 fetch_events:
+
+	if (!ep_events_available(ep))
+		ep_busy_loop(ep, timed_out);
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	if (!ep_events_available(ep)) {
+		/*
+		 * Busy poll timed out. Drop NAPI ID for now, we can add
+		 * it back in when we have moved a socket with a valid NAPI
+		 * ID onto the ready list.
+		 */
+		ep_reset_busy_poll_napi_id(ep);
+
 		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
