
Commit 99c07c4

netoptimizer authored and davem330 committed
xdp: tracking page_pool resources and safe removal
This patch is needed before we can allow drivers to use page_pool for DMA mappings. Today, with page_pool and the XDP return API, it is possible to remove the page_pool object (from the rhashtable) while there are still in-flight packet-pages. This is safely handled via RCU: failed lookups in __xdp_return() fall back to calling put_page() once the page_pool object is gone. But if the page is still DMA mapped, it will not get correctly DMA unmapped.

To solve this, the page_pool is extended to track in-flight pages, and the XDP disconnect system queries the page_pool and waits, via a workqueue, for all in-flight pages to be returned.

To avoid hurting performance while tracking in-flight pages, the implementation uses two (unsigned) counters, placed on different cache-lines, from which the number of in-flight packets can be deduced. This works by mapping the unsigned "sequence" counters onto signed two's complement arithmetic, the same trick used by the kernel's time_after() macros, described in kernel commits 1ba3aab and 5a581b3, and explained in RFC 1982. The two incrementing counters only need to be read and compared when checking whether it is safe to free the page_pool structure, which happens only after the driver has disconnected the RX/alloc side, i.e. not on a fast-path.

Page_pool tracking is also enabled for the non-DMA use-case, as it can be used for statistics later.

After this patch, using page_pool requires a more strict resource "release", e.g. via page_pool_release_page(), which was introduced in this patchset; the previous patches implement/fix this stricter requirement. Drivers no longer call page_pool_destroy(). Drivers already call xdp_rxq_info_unreg(), which calls xdp_rxq_info_unreg_mem_model(); that in turn attempts to disconnect the mem id and, if the attempt fails, schedules the disconnect for later via a delayed workqueue.

Signed-off-by: Jesper Dangaard Brouer <[email protected]>
Reviewed-by: Ilias Apalodimas <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 29b006a commit 99c07c4
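The counter trick described in the commit message can be illustrated in a few lines of plain C. This is a minimal userspace sketch, not part of the patch: the hold/release counter names mirror the patch, while the concrete values are made up to show that the accounting survives u32 wrap-around.

/* Minimal userspace sketch of the two-counter in-flight accounting:
 * an unsigned "hold" counter bumped on page allocation and an unsigned
 * "release" counter bumped on page return. Casting their difference to
 * a signed 32-bit value yields the in-flight count, and stays correct
 * across u32 wrap-around as long as fewer than 2^31 pages are in
 * flight (serial number arithmetic, RFC 1982).
 */
#include <stdint.h>
#include <stdio.h>

#define _distance(a, b) ((int32_t)((a) - (b)))

int main(void)
{
	uint32_t hold_cnt    = 5;           /* alloc counter, already wrapped past UINT32_MAX */
	uint32_t release_cnt = 4294967290u; /* release counter, not yet wrapped (UINT32_MAX - 5) */

	int32_t inflight = _distance(hold_cnt, release_cnt);

	printf("in-flight pages: %d\n", inflight); /* prints 11 */
	return 0;
}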

File tree

4 files changed: 136 insertions(+), 35 deletions(-)

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 0 additions & 3 deletions
@@ -643,9 +643,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	}
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
-	if (rq->page_pool)
-		page_pool_destroy(rq->page_pool);
-
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
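For drivers being converted, here is a hedged sketch of what RX-queue teardown looks like after this patch, loosely modeled on the mlx5 change above. The driver and struct names (mydrv_free_rq, struct mydrv_rq) are hypothetical; the point is that the explicit page_pool_destroy() call disappears, and xdp_rxq_info_unreg() is left to release the pool once all in-flight pages have been returned.

/* Hypothetical driver teardown (sketch, not from the patch). */
static void mydrv_free_rq(struct mydrv_rq *rq)
{
	/* Unregistering the rxq also unregisters the memory model; for
	 * MEM_TYPE_PAGE_POOL this disconnects the mem id and frees the
	 * page_pool once in-flight pages are back, deferring via a
	 * delayed workqueue when necessary.
	 */
	xdp_rxq_info_unreg(&rq->xdp_rxq);

	/* page_pool_destroy(rq->page_pool) is no longer called here. */

	/* ... release remaining driver-private RX resources ... */
}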

include/net/page_pool.h

Lines changed: 31 additions & 10 deletions
@@ -16,14 +16,16 @@
  * page_pool_alloc_pages() call. Drivers should likely use
  * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
  *
- * If page_pool handles DMA mapping (use page->private), then API user
- * is responsible for invoking page_pool_put_page() once. In-case of
- * elevated refcnt, the DMA state is released, assuming other users of
- * the page will eventually call put_page().
+ * API keeps track of in-flight pages, in-order to let API user know
+ * when it is safe to dealloactor page_pool object. Thus, API users
+ * must make sure to call page_pool_release_page() when a page is
+ * "leaving" the page_pool. Or call page_pool_put_page() where
+ * appropiate. For maintaining correct accounting.
  *
- * If no DMA mapping is done, then it can act as shim-layer that
- * fall-through to alloc_page. As no state is kept on the page, the
- * regular put_page() call is sufficient.
+ * API user must only call page_pool_put_page() once on a page, as it
+ * will either recycle the page, or in case of elevated refcnt, it
+ * will release the DMA mapping and in-flight state accounting. We
+ * hope to lift this requirement in the future.
  */
 #ifndef _NET_PAGE_POOL_H
 #define _NET_PAGE_POOL_H
@@ -66,9 +68,10 @@ struct page_pool_params {
 };
 
 struct page_pool {
-	struct rcu_head rcu;
 	struct page_pool_params p;
 
+	u32 pages_state_hold_cnt;
+
 	/*
 	 * Data structure for allocation side
 	 *
@@ -96,6 +99,8 @@ struct page_pool {
 	 * TODO: Implement bulk return pages into this structure.
 	 */
 	struct ptr_ring ring;
+
+	atomic_t pages_state_release_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -109,8 +114,6 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void page_pool_destroy(struct page_pool *pool);
-
 void __page_pool_free(struct page_pool *pool);
 static inline void page_pool_free(struct page_pool *pool)
 {
@@ -143,6 +146,24 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+/* API user MUST have disconnected alloc-side (not allowed to call
+ * page_pool_alloc_pages()) before calling this. The free-side can
+ * still run concurrently, to handle in-flight packet-pages.
+ *
+ * A request to shutdown can fail (with false) if there are still
+ * in-flight packet-pages.
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool);
+static inline bool page_pool_request_shutdown(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	return __page_pool_request_shutdown(pool);
+#endif
+}
+
 /* Disconnects a page (from a page_pool). API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
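To make the ownership rules in the header comment concrete, here is a hedged setup-side sketch showing how a page_pool typically gets tied to the XDP memory model, so that the disconnect path introduced in this patch can later find and release it. The driver names (mydrv_open_rq, struct mydrv_rq) and the parameter values are illustrative, not taken from the patch; the calls shown (page_pool_create(), xdp_rxq_info_reg(), xdp_rxq_info_reg_mem_model()) are pre-existing kernel APIs this patchset builds on.

/* Hypothetical RX-queue setup (sketch, error unwinding omitted).
 * Registering the pool as MEM_TYPE_PAGE_POOL is what lets
 * xdp_rxq_info_unreg_mem_model() drive the new shutdown and
 * in-flight accounting at teardown time.
 */
static int mydrv_open_rq(struct mydrv_rq *rq, struct net_device *netdev,
			 struct device *dma_dev, u32 queue_index)
{
	struct page_pool_params pp_params = {
		.flags     = PP_FLAG_DMA_MAP,   /* let page_pool own the DMA mapping */
		.order     = 0,
		.pool_size = 256,               /* illustrative ring size */
		.nid       = NUMA_NO_NODE,
		.dev       = dma_dev,
		.dma_dir   = DMA_FROM_DEVICE,
	};
	int err;

	rq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rq->page_pool))
		return PTR_ERR(rq->page_pool);

	err = xdp_rxq_info_reg(&rq->xdp_rxq, netdev, queue_index);
	if (err)
		return err;

	/* Hand the allocator to the XDP mem model; teardown goes through
	 * xdp_rxq_info_unreg() from now on, not page_pool_destroy().
	 */
	return xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_POOL,
					  rq->page_pool);
}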

net/core/page_pool.c

Lines changed: 46 additions & 16 deletions
@@ -43,6 +43,8 @@ static int page_pool_init(struct page_pool *pool,
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
+	atomic_set(&pool->pages_state_release_cnt, 0);
+
 	return 0;
 }
 
@@ -151,6 +153,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	page->dma_addr = dma;
 
 skip_dma_map:
+	/* Track how many pages are held 'in-flight' */
+	pool->pages_state_hold_cnt++;
+
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
 }
@@ -173,26 +178,58 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)	(s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 distance;
+
+	distance = _distance(hold_cnt, release_cnt);
+
+	/* TODO: Add tracepoint here */
+	return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+	s32 inflight = page_pool_inflight(pool);
+
+	/* The distance should not be able to become negative */
+	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+	return (inflight == 0);
+}
+
 /* Cleanup page_pool state from page */
 static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
 {
 	dma_addr_t dma;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-		return;
+		goto skip_dma_unmap;
 
 	dma = page->dma_addr;
 	/* DMA unmap */
 	dma_unmap_page_attrs(pool->p.dev, dma,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
+skip_dma_unmap:
+	atomic_inc(&pool->pages_state_release_cnt);
 }
 
 /* unmap the page and clean our state */
 void page_pool_unmap_page(struct page_pool *pool, struct page *page)
 {
+	/* When page is unmapped, this implies page will not be
+	 * returned to page_pool.
+	 */
 	__page_pool_clean_page(pool, page);
 }
 EXPORT_SYMBOL(page_pool_unmap_page);
@@ -201,6 +238,7 @@ EXPORT_SYMBOL(page_pool_unmap_page);
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
 	__page_pool_clean_page(pool, page);
+
 	put_page(page);
 	/* An optimization would be to call __free_pages(page, pool->p.order)
 	 * knowing page is not part of page-cache (thus avoiding a
@@ -296,24 +334,17 @@ void __page_pool_free(struct page_pool *pool)
 {
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+	WARN(!__page_pool_safe_to_destroy(pool), "still in-flight pages");
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 	kfree(pool);
 }
 EXPORT_SYMBOL(__page_pool_free);
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
-{
-	struct page_pool *pool;
-
-	pool = container_of(rcu, struct page_pool, rcu);
-
-	__page_pool_empty_ring(pool);
-	__page_pool_free(pool);
-}
-
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
 {
 	struct page *page;
 
@@ -331,7 +362,6 @@ void page_pool_destroy(struct page_pool *pool)
 	 */
 	__page_pool_empty_ring(pool);
 
-	/* An xdp_mem_allocator can still ref page_pool pointer */
-	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+	return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);

net/core/xdp.c

Lines changed: 59 additions & 6 deletions
@@ -38,6 +38,7 @@ struct xdp_mem_allocator {
 	};
 	struct rhash_head node;
 	struct rcu_head rcu;
+	struct delayed_work defer_wq;
 };
 
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
@@ -79,13 +80,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
 	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
+	/* Allocator have indicated safe to remove before this is called */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		page_pool_free(xa->page_pool);
+
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
-	/* Notice, driver is expected to free the *allocator,
-	 * e.g. page_pool, and MUST also use RCU free.
-	 */
-
 	/* Poison memory */
 	xa->mem.id = 0xFFFF;
 	xa->mem.type = 0xF0F0;
@@ -94,6 +95,46 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
+bool __mem_id_disconnect(int id)
+{
+	struct xdp_mem_allocator *xa;
+	bool safe_to_remove = true;
+
+	mutex_lock(&mem_id_lock);
+
+	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+		return true;
+	}
+
+	/* Detects in-flight packet-pages for page_pool */
+	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+	if (safe_to_remove &&
+	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+	return safe_to_remove;
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+
+	if (__mem_id_disconnect(xa->mem.id))
+		return;
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
 	struct xdp_mem_allocator *xa;
@@ -112,16 +153,28 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
+	if (__mem_id_disconnect(id))
+		return;
+
+	/* Could not disconnect, defer new disconnect attempt to later */
 	mutex_lock(&mem_id_lock);
 
 	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
-		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		return;
+	}
 
+	INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
 	mutex_unlock(&mem_id_lock);
+	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
