Skip to content

Commit 71f0dd5

Browse files
committed
Merge branch 'io_uring-zero-copy-rx'
David Wei says: ==================== io_uring zero copy rx This patchset contains net/ patches needed by a new io_uring request implementing zero copy rx into userspace pages, eliminating a kernel to user copy. We configure a page pool that a driver uses to fill a hw rx queue to hand out user pages instead of kernel pages. Any data that ends up hitting this hw rx queue will thus be dma'd into userspace memory directly, without needing to be bounced through kernel memory. 'Reading' data out of a socket instead becomes a _notification_ mechanism, where the kernel tells userspace where the data is. The overall approach is similar to the devmem TCP proposal. This relies on hw header/data split, flow steering and RSS to ensure packet headers remain in kernel memory and only desired flows hit a hw rx queue configured for zero copy. Configuring this is outside of the scope of this patchset. We share netdev core infra with devmem TCP. The main difference is that io_uring is used for the uAPI and the lifetimes of all objects are bound to an io_uring instance. Data is 'read' using a new io_uring request type. When done, data is returned via a new shared refill queue. A zero copy page pool refills a hw rx queue from this refill queue directly. Of course, the lifetimes of these data buffers are managed by io_uring rather than the networking stack, with different refcounting rules. This patchset is the first step adding basic zero copy support. We will extend this iteratively with new features e.g. dynamically allocated zero copy areas, THP support, dmabuf support, improved copy fallback, general optimisations and more. In terms of netdev support, we're first targeting Broadcom bnxt. Patches aren't included since Taehee Yoo has already sent a more comprehensive patchset adding support in [1]. Google gve should already support this, and Mellanox mlx5 support is WIP pending driver changes. 
=========== Performance =========== Note: Comparison with epoll + TCP_ZEROCOPY_RECEIVE isn't done yet. Test setup: * AMD EPYC 9454 * Broadcom BCM957508 200G * Kernel v6.11 base [2] * liburing fork [3] * kperf fork [4] * 4K MTU * Single TCP flow With application thread + net rx softirq pinned to _different_ cores: +-------------------------------+ | epoll | io_uring | |-----------|-------------------| | 82.2 Gbps | 116.2 Gbps (+41%) | +-------------------------------+ Pinned to _same_ core: +-------------------------------+ | epoll | io_uring | |-----------|-------------------| | 62.6 Gbps | 80.9 Gbps (+29%) | +-------------------------------+ ===== Links ===== Broadcom bnxt support: [1]: https://lore.kernel.org/[email protected] Linux kernel branch including io_uring bits: [2]: https://github.com/isilence/linux.git zcrx/v13 liburing for testing: [3]: https://github.com/isilence/liburing.git zcrx/next kperf for testing: [4]: https://git.kernel.dk/kperf.git ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents ba6ec09 + 6e18ed9 commit 71f0dd5

File tree

14 files changed

+321
-81
lines changed

14 files changed

+321
-81
lines changed

Documentation/netlink/specs/netdev.yaml

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,9 @@ attribute-sets:
114114
doc: Bitmask of enabled AF_XDP features.
115115
type: u64
116116
enum: xsk-flags
117+
-
118+
name: io-uring-provider-info
119+
attributes: []
117120
-
118121
name: page-pool
119122
attributes:
@@ -171,6 +174,11 @@ attribute-sets:
171174
name: dmabuf
172175
doc: ID of the dmabuf this page-pool is attached to.
173176
type: u32
177+
-
178+
name: io-uring
179+
doc: io-uring memory provider information.
180+
type: nest
181+
nested-attributes: io-uring-provider-info
174182
-
175183
name: page-pool-info
176184
subset-of: page-pool
@@ -296,6 +304,11 @@ attribute-sets:
296304
name: dmabuf
297305
doc: ID of the dmabuf attached to this queue, if any.
298306
type: u32
307+
-
308+
name: io-uring
309+
doc: io_uring memory provider information.
310+
type: nest
311+
nested-attributes: io-uring-provider-info
299312

300313
-
301314
name: qstats
@@ -572,6 +585,7 @@ operations:
572585
- inflight-mem
573586
- detach-time
574587
- dmabuf
588+
- io-uring
575589
dump:
576590
reply: *pp-reply
577591
config-cond: page-pool
@@ -637,6 +651,7 @@ operations:
637651
- napi-id
638652
- ifindex
639653
- dmabuf
654+
- io-uring
640655
dump:
641656
request:
642657
attributes:

include/net/netmem.h

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,11 +24,20 @@ struct net_iov {
2424
unsigned long __unused_padding;
2525
unsigned long pp_magic;
2626
struct page_pool *pp;
27-
struct dmabuf_genpool_chunk_owner *owner;
27+
struct net_iov_area *owner;
2828
unsigned long dma_addr;
2929
atomic_long_t pp_ref_count;
3030
};
3131

32+
struct net_iov_area {
33+
/* Array of net_iovs for this area. */
34+
struct net_iov *niovs;
35+
size_t num_niovs;
36+
37+
/* Offset into the dma-buf where this chunk starts. */
38+
unsigned long base_virtual;
39+
};
40+
3241
/* These fields in struct page are used by the page_pool and net stack:
3342
*
3443
* struct {
@@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
5463
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
5564
#undef NET_IOV_ASSERT_OFFSET
5665

66+
static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
67+
{
68+
return niov->owner;
69+
}
70+
71+
static inline unsigned int net_iov_idx(const struct net_iov *niov)
72+
{
73+
return niov - net_iov_owner(niov)->niovs;
74+
}
75+
5776
/* netmem */
5877

5978
/**
Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H
3+
#define _NET_PAGE_POOL_MEMORY_PROVIDER_H
4+
5+
#include <net/netmem.h>
6+
#include <net/page_pool/types.h>
7+
8+
struct netdev_rx_queue;
9+
struct sk_buff;
10+
11+
struct memory_provider_ops {
12+
netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp);
13+
bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem);
14+
int (*init)(struct page_pool *pool);
15+
void (*destroy)(struct page_pool *pool);
16+
int (*nl_fill)(void *mp_priv, struct sk_buff *rsp,
17+
struct netdev_rx_queue *rxq);
18+
void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq);
19+
};
20+
21+
bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
22+
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
23+
void net_mp_niov_clear_page_pool(struct net_iov *niov);
24+
25+
int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
26+
struct pp_memory_provider_params *p);
27+
void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
28+
struct pp_memory_provider_params *old_p);
29+
30+
/**
31+
* net_mp_netmem_place_in_cache() - give a netmem to a page pool
32+
* @pool: the page pool to place the netmem into
33+
* @netmem: netmem to give
34+
*
35+
* Push an accounted netmem into the page pool's allocation cache. The caller
36+
* must ensure that there is space in the cache. It should only be called off
37+
* the mp_ops->alloc_netmems() path.
38+
*/
39+
static inline void net_mp_netmem_place_in_cache(struct page_pool *pool,
40+
netmem_ref netmem)
41+
{
42+
pool->alloc.cache[pool->alloc.count++] = netmem;
43+
}
44+
45+
#endif

include/net/page_pool/types.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,11 @@ struct page_pool_stats {
152152
*/
153153
#define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long))
154154

155+
struct memory_provider_ops;
156+
155157
struct pp_memory_provider_params {
156158
void *mp_priv;
159+
const struct memory_provider_ops *mp_ops;
157160
};
158161

159162
struct page_pool {
@@ -216,6 +219,7 @@ struct page_pool {
216219
struct ptr_ring ring;
217220

218221
void *mp_priv;
222+
const struct memory_provider_ops *mp_ops;
219223

220224
#ifdef CONFIG_PAGE_POOL_STATS
221225
/* recycle stats are per-cpu to avoid locking */

include/uapi/linux/netdev.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,11 @@ enum {
8686
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
8787
};
8888

89+
enum {
90+
__NETDEV_A_IO_URING_PROVIDER_INFO_MAX,
91+
NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1)
92+
};
93+
8994
enum {
9095
NETDEV_A_PAGE_POOL_ID = 1,
9196
NETDEV_A_PAGE_POOL_IFINDEX,
@@ -94,6 +99,7 @@ enum {
9499
NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
95100
NETDEV_A_PAGE_POOL_DETACH_TIME,
96101
NETDEV_A_PAGE_POOL_DMABUF,
102+
NETDEV_A_PAGE_POOL_IO_URING,
97103

98104
__NETDEV_A_PAGE_POOL_MAX,
99105
NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1)
@@ -136,6 +142,7 @@ enum {
136142
NETDEV_A_QUEUE_TYPE,
137143
NETDEV_A_QUEUE_NAPI_ID,
138144
NETDEV_A_QUEUE_DMABUF,
145+
NETDEV_A_QUEUE_IO_URING,
139146

140147
__NETDEV_A_QUEUE_MAX,
141148
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)

net/core/dev.c

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -159,6 +159,7 @@
159159
#include <net/netdev_rx_queue.h>
160160
#include <net/page_pool/types.h>
161161
#include <net/page_pool/helpers.h>
162+
#include <net/page_pool/memory_provider.h>
162163
#include <net/rps.h>
163164
#include <linux/phy_link_topology.h>
164165

@@ -11745,6 +11746,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1174511746
}
1174611747
EXPORT_SYMBOL(unregister_netdevice_queue);
1174711748

11749+
static void dev_memory_provider_uninstall(struct net_device *dev)
11750+
{
11751+
unsigned int i;
11752+
11753+
for (i = 0; i < dev->real_num_rx_queues; i++) {
11754+
struct netdev_rx_queue *rxq = &dev->_rx[i];
11755+
struct pp_memory_provider_params *p = &rxq->mp_params;
11756+
11757+
if (p->mp_ops && p->mp_ops->uninstall)
11758+
p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
11759+
}
11760+
}
11761+
1174811762
void unregister_netdevice_many_notify(struct list_head *head,
1174911763
u32 portid, const struct nlmsghdr *nlh)
1175011764
{
@@ -11799,7 +11813,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
1179911813
dev_tcx_uninstall(dev);
1180011814
dev_xdp_uninstall(dev);
1180111815
bpf_dev_bound_netdev_unregister(dev);
11802-
dev_dmabuf_uninstall(dev);
11816+
dev_memory_provider_uninstall(dev);
1180311817

1180411818
netdev_offload_xstats_disable_all(dev);
1180511819

0 commit comments

Comments
 (0)