Skip to content

Commit e61caf0

Browse files
committed
Merge branch 'page_pool-allow-caching-from-safely-localized-napi'
Jakub Kicinski says: ==================== page_pool: allow caching from safely localized NAPI I went back to the explicit "are we in NAPI method", mostly because I don't like having both around :( (even tho I maintain that in_softirq() && !in_hardirq() is as safe, as softirqs do not nest). Still returning the skbs to a CPU, tho, not to the NAPI instance. I reckon we could create a small refcounted struct per NAPI instance which would allow sockets and other users to hold a persistent and safe reference. But that's a bigger change, and I get 90+% recycling thru the cache with just these patches (for RR and streaming tests with 100% CPU use it's almost 100%). Some numbers for streaming test with 100% CPU use (from previous version, but really they perform the same): HW-GRO page=page before after before after recycle: cached: 0 138669686 0 150197505 cache_full: 0 223391 0 74582 ring: 138551933 9997191 149299454 0 ring_full: 0 488 3154 127590 released_refcnt: 0 0 0 0 alloc: fast: 136491361 148615710 146969587 150322859 slow: 1772 1799 144 105 slow_high_order: 0 0 0 0 empty: 1772 1799 144 105 refill: 2165245 156302 2332880 2128 waive: 0 0 0 0 v1: https://lore.kernel.org/all/[email protected]/ rfcv2: https://lore.kernel.org/all/[email protected]/ ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents c11d2e7 + 294e39e commit e61caf0

File tree

8 files changed

+58
-30
lines changed

8 files changed

+58
-30
lines changed

Documentation/networking/page_pool.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ Registration
165165
pp_params.pool_size = DESC_NUM;
166166
pp_params.nid = NUMA_NO_NODE;
167167
pp_params.dev = priv->dev;
168+
pp_params.napi = napi; /* only if locking is tied to NAPI */
168169
pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
169170
page_pool = page_pool_create(&pp_params);
170171

drivers/net/ethernet/broadcom/bnxt/bnxt.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3211,6 +3211,7 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
32113211

32123212
pp.pool_size = bp->rx_ring_size;
32133213
pp.nid = dev_to_node(&bp->pdev->dev);
3214+
pp.napi = &rxr->bnapi->napi;
32143215
pp.dev = &bp->pdev->dev;
32153216
pp.dma_dir = DMA_BIDIRECTIONAL;
32163217

include/linux/netdevice.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,8 +360,11 @@ struct napi_struct {
360360
unsigned long gro_bitmask;
361361
int (*poll)(struct napi_struct *, int);
362362
#ifdef CONFIG_NETPOLL
363+
/* CPU actively polling if netpoll is configured */
363364
int poll_owner;
364365
#endif
366+
/* CPU on which NAPI has been scheduled for processing */
367+
int list_owner;
365368
struct net_device *dev;
366369
struct gro_list gro_hash[GRO_HASH_BUCKETS];
367370
struct sk_buff *skb;

include/linux/skbuff.h

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
33863386
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
33873387
}
33883388

3389+
static inline void
3390+
napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
3391+
{
3392+
struct page *page = skb_frag_page(frag);
3393+
3394+
#ifdef CONFIG_PAGE_POOL
3395+
if (recycle && page_pool_return_skb_page(page, napi_safe))
3396+
return;
3397+
#endif
3398+
put_page(page);
3399+
}
3400+
33893401
/**
33903402
* __skb_frag_unref - release a reference on a paged fragment.
33913403
* @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
33963408
*/
33973409
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
33983410
{
3399-
struct page *page = skb_frag_page(frag);
3400-
3401-
#ifdef CONFIG_PAGE_POOL
3402-
if (recycle && page_pool_return_skb_page(page))
3403-
return;
3404-
#endif
3405-
put_page(page);
3411+
napi_frag_unref(frag, recycle, false);
34063412
}
34073413

34083414
/**

include/net/page_pool.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ struct page_pool_params {
7777
unsigned int pool_size;
7878
int nid; /* Numa node id to allocate from pages from */
7979
struct device *dev; /* device, for DMA pre-mapping purposes */
80+
struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
8081
enum dma_data_direction dma_dir; /* DMA mapping direction */
8182
unsigned int max_len; /* max DMA sync memory size */
8283
unsigned int offset; /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
239240
return pool->p.dma_dir;
240241
}
241242

242-
bool page_pool_return_skb_page(struct page *page);
243+
bool page_pool_return_skb_page(struct page *page, bool napi_safe);
243244

244245
struct page_pool *page_pool_create(const struct page_pool_params *params);
245246

net/core/dev.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
43594359
}
43604360

43614361
list_add_tail(&napi->poll_list, &sd->poll_list);
4362+
WRITE_ONCE(napi->list_owner, smp_processor_id());
43624363
/* If not called from net_rx_action()
43634364
* we have to raise NET_RX_SOFTIRQ.
43644365
*/
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
60696070
list_del_init(&n->poll_list);
60706071
local_irq_restore(flags);
60716072
}
6073+
WRITE_ONCE(n->list_owner, -1);
60726074

60736075
val = READ_ONCE(n->state);
60746076
do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
63846386
#ifdef CONFIG_NETPOLL
63856387
napi->poll_owner = -1;
63866388
#endif
6389+
napi->list_owner = -1;
63876390
set_bit(NAPI_STATE_SCHED, &napi->state);
63886391
set_bit(NAPI_STATE_NPSVC, &napi->state);
63896392
list_add_rcu(&napi->dev_list, &dev->napi_list);

net/core/page_pool.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <linux/mm.h> /* for put_page() */
2020
#include <linux/poison.h>
2121
#include <linux/ethtool.h>
22+
#include <linux/netdevice.h>
2223

2324
#include <trace/events/page_pool.h>
2425

@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
874875
}
875876
EXPORT_SYMBOL(page_pool_update_nid);
876877

877-
bool page_pool_return_skb_page(struct page *page)
878+
bool page_pool_return_skb_page(struct page *page, bool napi_safe)
878879
{
880+
struct napi_struct *napi;
879881
struct page_pool *pp;
882+
bool allow_direct;
880883

881884
page = compound_head(page);
882885

@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
892895

893896
pp = page->pp;
894897

898+
/* Allow direct recycle if we have reasons to believe that we are
899+
* in the same context as the consumer would run, so there's
900+
* no possible race.
901+
*/
902+
napi = pp->p.napi;
903+
allow_direct = napi_safe && napi &&
904+
READ_ONCE(napi->list_owner) == smp_processor_id();
905+
895906
/* Driver set this to memory recycling info. Reset it on recycle.
896907
* This will *not* work for NIC using a split-page memory model.
897908
* The page will be returned to the pool here regardless of the
898909
* 'flipped' fragment being in use or not.
899910
*/
900-
page_pool_put_full_page(pp, page, false);
911+
page_pool_put_full_page(pp, page, allow_direct);
901912

902913
return true;
903914
}

net/core/skbuff.c

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -839,11 +839,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
839839
skb_get(list);
840840
}
841841

842-
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
842+
static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
843843
{
844844
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
845845
return false;
846-
return page_pool_return_skb_page(virt_to_page(data));
846+
return page_pool_return_skb_page(virt_to_page(data), napi_safe);
847847
}
848848

849849
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -856,20 +856,21 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
856856
kfree(head);
857857
}
858858

859-
static void skb_free_head(struct sk_buff *skb)
859+
static void skb_free_head(struct sk_buff *skb, bool napi_safe)
860860
{
861861
unsigned char *head = skb->head;
862862

863863
if (skb->head_frag) {
864-
if (skb_pp_recycle(skb, head))
864+
if (skb_pp_recycle(skb, head, napi_safe))
865865
return;
866866
skb_free_frag(head);
867867
} else {
868868
skb_kfree_head(head, skb_end_offset(skb));
869869
}
870870
}
871871

872-
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
872+
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
873+
bool napi_safe)
873874
{
874875
struct skb_shared_info *shinfo = skb_shinfo(skb);
875876
int i;
@@ -888,13 +889,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
888889
}
889890

890891
for (i = 0; i < shinfo->nr_frags; i++)
891-
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
892+
napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
892893

893894
free_head:
894895
if (shinfo->frag_list)
895896
kfree_skb_list_reason(shinfo->frag_list, reason);
896897

897-
skb_free_head(skb);
898+
skb_free_head(skb, napi_safe);
898899
exit:
899900
/* When we clone an SKB we copy the recycling bit. The pp_recycle
900901
* bit is only set on the head though, so in order to avoid races
@@ -955,11 +956,12 @@ void skb_release_head_state(struct sk_buff *skb)
955956
}
956957

957958
/* Free everything but the sk_buff shell. */
958-
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
959+
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
960+
bool napi_safe)
959961
{
960962
skb_release_head_state(skb);
961963
if (likely(skb->head))
962-
skb_release_data(skb, reason);
964+
skb_release_data(skb, reason, napi_safe);
963965
}
964966

965967
/**
@@ -973,7 +975,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
973975

974976
void __kfree_skb(struct sk_buff *skb)
975977
{
976-
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
978+
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
977979
kfree_skbmem(skb);
978980
}
979981
EXPORT_SYMBOL(__kfree_skb);
@@ -1027,7 +1029,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
10271029
return;
10281030
}
10291031

1030-
skb_release_all(skb, reason);
1032+
skb_release_all(skb, reason, false);
10311033
sa->skb_array[sa->skb_count++] = skb;
10321034

10331035
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1201,7 +1203,7 @@ EXPORT_SYMBOL(consume_skb);
12011203
void __consume_stateless_skb(struct sk_buff *skb)
12021204
{
12031205
trace_consume_skb(skb, __builtin_return_address(0));
1204-
skb_release_data(skb, SKB_CONSUMED);
1206+
skb_release_data(skb, SKB_CONSUMED, false);
12051207
kfree_skbmem(skb);
12061208
}
12071209

@@ -1226,7 +1228,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
12261228

12271229
void __kfree_skb_defer(struct sk_buff *skb)
12281230
{
1229-
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
1231+
skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
12301232
napi_skb_cache_put(skb);
12311233
}
12321234

@@ -1264,7 +1266,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
12641266
return;
12651267
}
12661268

1267-
skb_release_all(skb, SKB_CONSUMED);
1269+
skb_release_all(skb, SKB_CONSUMED, !!budget);
12681270
napi_skb_cache_put(skb);
12691271
}
12701272
EXPORT_SYMBOL(napi_consume_skb);
@@ -1395,7 +1397,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
13951397
*/
13961398
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
13971399
{
1398-
skb_release_all(dst, SKB_CONSUMED);
1400+
skb_release_all(dst, SKB_CONSUMED, false);
13991401
return __skb_clone(dst, src);
14001402
}
14011403
EXPORT_SYMBOL_GPL(skb_morph);
@@ -2018,9 +2020,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
20182020
if (skb_has_frag_list(skb))
20192021
skb_clone_fraglist(skb);
20202022

2021-
skb_release_data(skb, SKB_CONSUMED);
2023+
skb_release_data(skb, SKB_CONSUMED, false);
20222024
} else {
2023-
skb_free_head(skb);
2025+
skb_free_head(skb, false);
20242026
}
20252027
off = (data + nhead) - skb->head;
20262028

@@ -6389,12 +6391,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
63896391
skb_frag_ref(skb, i);
63906392
if (skb_has_frag_list(skb))
63916393
skb_clone_fraglist(skb);
6392-
skb_release_data(skb, SKB_CONSUMED);
6394+
skb_release_data(skb, SKB_CONSUMED, false);
63936395
} else {
63946396
/* we can reuse existing recount- all we did was
63956397
* relocate values
63966398
*/
6397-
skb_free_head(skb);
6399+
skb_free_head(skb, false);
63986400
}
63996401

64006402
skb->head = data;
@@ -6529,7 +6531,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
65296531
skb_kfree_head(data, size);
65306532
return -ENOMEM;
65316533
}
6532-
skb_release_data(skb, SKB_CONSUMED);
6534+
skb_release_data(skb, SKB_CONSUMED, false);
65336535

65346536
skb->head = data;
65356537
skb->head_frag = 0;

0 commit comments

Comments
 (0)