Skip to content

Commit 217f697

Browse files
Eric Dumazet authored and davem330 committed
net: busy-poll: allow preemption in sk_busy_loop()
After commit 4cd13c2 ("softirq: Let ksoftirqd do its job"), sk_busy_loop() needs a bit of care : softirqs might be delayed since we do not allow preemption yet. This patch adds preemptiom points in sk_busy_loop(), and makes sure no unnecessary cache line dirtying or atomic operations are done while looping. A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED, so that sk_busy_loop() does not have to grab it again. Similarly, netpoll_poll_lock() is done one time. This gives about 10 to 20 % improvement in various busy polling tests, especially when many threads are busy polling in configurations with large number of NIC queues. This should allow experimenting with bigger delays without hurting overall latencies. Tested: On a 40Gb mlx4 NIC, 32 RX/TX queues. echo 70 >/proc/sys/net/core/busy_read for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done Before: After: 1: 90072 92819 2: 157289 184007 3: 235772 213504 4: 344074 357513 5: 394755 458267 6: 461151 487819 7: 549116 625963 8: 544423 716219 9: 720460 738446 10: 794686 837612 11: 915998 923960 12: 937507 925107 13: 1019677 971506 14: 1046831 1113650 15: 1114154 1148902 16: 1105221 1179263 17: 1266552 1299585 18: 1258454 1383817 19: 1341453 1312194 20: 1363557 1488487 21: 1387979 1501004 22: 1417552 1601683 23: 1550049 1642002 24: 1568876 1601915 25: 1560239 1683607 26: 1640207 1745211 27: 1706540 1723574 28: 1638518 1722036 29: 1734309 1757447 30: 1782007 1855436 31: 1724806 1888539 32: 1717716 1944297 33: 1778716 1869118 34: 1805738 1983466 35: 1815694 2020758 36: 1893059 2035632 37: 1843406 2034653 38: 1888830 2086580 39: 1972827 2143567 40: 1877729 2181851 Signed-off-by: Eric Dumazet <[email protected]> Cc: Willem de Bruijn <[email protected]> Cc: Adam Belay <[email protected]> Cc: Tariq Toukan <[email protected]> Cc: Yuval Mintz <[email protected]> Cc: Ariel Elior <[email protected]> Signed-off-by: 
David S. Miller <[email protected]>
1 parent 2874aa2 commit 217f697

File tree

2 files changed

+92
-20
lines changed

2 files changed

+92
-20
lines changed

include/linux/netdevice.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,16 @@ enum {
334334
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
335335
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
336336
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
337+
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
338+
};
339+
340+
enum {
341+
NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED),
342+
NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE),
343+
NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC),
344+
NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED),
345+
NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
346+
NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
337347
};
338348

339349
enum gro_result {

net/core/dev.c

Lines changed: 82 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n)
49024902
{
49034903
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
49044904

4905+
/* Some drivers call us directly, instead of calling
4906+
* napi_complete_done().
4907+
*/
4908+
if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4909+
return;
4910+
49054911
list_del_init(&n->poll_list);
49064912
smp_mb__before_atomic();
49074913
clear_bit(NAPI_STATE_SCHED, &n->state);
@@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done)
49134919
unsigned long flags;
49144920

49154921
/*
4916-
* don't let napi dequeue from the cpu poll list
4917-
* just in case its running on a different cpu
4922+
* 1) Don't let napi dequeue from the cpu poll list
4923+
* just in case its running on a different cpu.
4924+
* 2) If we are busy polling, do nothing here, we have
4925+
* the guarantee we will be called later.
49184926
*/
4919-
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4927+
if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4928+
NAPIF_STATE_IN_BUSY_POLL)))
49204929
return;
49214930

49224931
if (n->gro_list) {
@@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
49564965
}
49574966

49584967
#if defined(CONFIG_NET_RX_BUSY_POLL)
4968+
49594969
#define BUSY_POLL_BUDGET 8
4970+
4971+
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4972+
{
4973+
int rc;
4974+
4975+
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4976+
4977+
local_bh_disable();
4978+
4979+
/* All we really want here is to re-enable device interrupts.
4980+
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
4981+
*/
4982+
rc = napi->poll(napi, BUSY_POLL_BUDGET);
4983+
netpoll_poll_unlock(have_poll_lock);
4984+
if (rc == BUSY_POLL_BUDGET)
4985+
__napi_schedule(napi);
4986+
local_bh_enable();
4987+
if (local_softirq_pending())
4988+
do_softirq();
4989+
}
4990+
49604991
bool sk_busy_loop(struct sock *sk, int nonblock)
49614992
{
49624993
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4994+
int (*napi_poll)(struct napi_struct *napi, int budget);
49634995
int (*busy_poll)(struct napi_struct *dev);
4996+
void *have_poll_lock = NULL;
49644997
struct napi_struct *napi;
4965-
int rc = false;
4998+
int rc;
4999+
5000+
restart:
5001+
rc = false;
5002+
napi_poll = NULL;
49665003

49675004
rcu_read_lock();
49685005

@@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
49735010
/* Note: ndo_busy_poll method is optional in linux-4.5 */
49745011
busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
49755012

4976-
do {
5013+
preempt_disable();
5014+
for (;;) {
49775015
rc = 0;
49785016
local_bh_disable();
49795017
if (busy_poll) {
49805018
rc = busy_poll(napi);
4981-
} else if (napi_schedule_prep(napi)) {
4982-
void *have = netpoll_poll_lock(napi);
4983-
4984-
if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4985-
rc = napi->poll(napi, BUSY_POLL_BUDGET);
4986-
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
4987-
if (rc == BUSY_POLL_BUDGET) {
4988-
napi_complete_done(napi, rc);
4989-
napi_schedule(napi);
4990-
}
4991-
}
4992-
netpoll_poll_unlock(have);
5019+
goto count;
49935020
}
5021+
if (!napi_poll) {
5022+
unsigned long val = READ_ONCE(napi->state);
5023+
5024+
/* If multiple threads are competing for this napi,
5025+
* we avoid dirtying napi->state as much as we can.
5026+
*/
5027+
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5028+
NAPIF_STATE_IN_BUSY_POLL))
5029+
goto count;
5030+
if (cmpxchg(&napi->state, val,
5031+
val | NAPIF_STATE_IN_BUSY_POLL |
5032+
NAPIF_STATE_SCHED) != val)
5033+
goto count;
5034+
have_poll_lock = netpoll_poll_lock(napi);
5035+
napi_poll = napi->poll;
5036+
}
5037+
rc = napi_poll(napi, BUSY_POLL_BUDGET);
5038+
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5039+
count:
49945040
if (rc > 0)
49955041
__NET_ADD_STATS(sock_net(sk),
49965042
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
49995045
if (rc == LL_FLUSH_FAILED)
50005046
break; /* permanent failure */
50015047

5002-
cpu_relax();
5003-
} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5004-
!need_resched() && !busy_loop_timeout(end_time));
5048+
if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5049+
busy_loop_timeout(end_time))
5050+
break;
50055051

5052+
if (unlikely(need_resched())) {
5053+
if (napi_poll)
5054+
busy_poll_stop(napi, have_poll_lock);
5055+
preempt_enable();
5056+
rcu_read_unlock();
5057+
cond_resched();
5058+
rc = !skb_queue_empty(&sk->sk_receive_queue);
5059+
if (rc || busy_loop_timeout(end_time))
5060+
return rc;
5061+
goto restart;
5062+
}
5063+
cpu_relax_lowlatency();
5064+
}
5065+
if (napi_poll)
5066+
busy_poll_stop(napi, have_poll_lock);
5067+
preempt_enable();
50065068
rc = !skb_queue_empty(&sk->sk_receive_queue);
50075069
out:
50085070
rcu_read_unlock();

0 commit comments

Comments (0)