
Commit 684009d
Merge branch 'XDP-redirect-memory-return-API'
Jesper Dangaard Brouer says:

====================
XDP redirect memory return API

Submitted against net-next, as it contains NIC driver changes.

This patchset works towards supporting different XDP RX-ring memory allocators, as this will be needed by the AF_XDP zero-copy mode. The patchset uses mlx5 as the sample driver, which gets XDP_REDIRECT RX-mode implemented, but not ndo_xdp_xmit (as this API is subject to change throughout the patchset).

A new struct xdp_frame is introduced (modeled after the cpumap xdp_pkt), and both ndo_xdp_xmit and the new xdp_return_frame end up using it.

Support for a driver-supplied allocator is implemented, and a refurbished version of page_pool is the first return-allocator type introduced. This will be an integration point for AF_XDP zero-copy.

The mlx5 driver evolves into using the page_pool, and sees a performance increase (redirecting via ndo_xdp_xmit out the ixgbe driver) from 6 Mpps to 12 Mpps.

The patchset stops at 16 patches (one over the limit), but more API changes are planned; specifically, extending the ndo_xdp_xmit and xdp_return_frame APIs to support bulking, as this will address some known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e, which got XDP_REDIRECT support in-between
V8: Updated based on feedback from the kbuild test robot, and adjusted for mlx5 changes; page_pool is only compiled into the kernel when a driver's Kconfig 'select's the feature
V9: Remove some inline statements and let the compiler decide what to inline; fix return value in the virtio_net driver; adjust for mlx5 changes in-between submissions
V10: Minor adjustment for mlx5 requested by Tariq; resubmit against net-next
V11: Avoid leaking info stored in frame data on page reuse
====================

Acked-by: Alexei Starovoitov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
2 parents 897ddc2 + 6dfb970
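
The pattern every driver below adopts: an xdp_buff (stack-scoped, valid only during the RX poll) is converted into an xdp_frame (whose metadata lives in the packet's own headroom) before transmit, and the frame is later freed through xdp_return_frame(), which dispatches to whatever allocator the originating RX queue registered. A minimal sketch of that lifecycle, assuming a hypothetical driver; convert_to_xdp_frame(), xdp_return_frame() and struct xdp_frame are the real APIs from this series, while drv_xmit_frame(), struct drv_ring and the DRV_XDP_* codes are illustrative stand-ins:

#include <net/xdp.h>

/* Sketch only. convert_to_xdp_frame() writes the frame metadata into
 * the packet's headroom, so the frame can outlive the xdp_buff.
 */
static int drv_xdp_tx(struct xdp_buff *xdp, struct drv_ring *ring)
{
	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);

	if (unlikely(!xdpf))		/* headroom too small for metadata */
		return DRV_XDP_CONSUMED;

	/* The TX ring entry stores xdpf; on TX completion the driver
	 * calls xdp_return_frame(xdpf) instead of page_frag_free(),
	 * letting the frame's registered allocator reclaim the page.
	 */
	return drv_xmit_frame(xdpf, ring);
}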

File tree: 22 files changed, +1102 −198 lines

drivers/net/ethernet/intel/i40e/i40e_txrx.c

Lines changed: 23 additions & 10 deletions
@@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
 		else if (ring_is_xdp(ring))
-			page_frag_free(tx_buffer->raw_buf);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
 		/* free the skb/XDP data */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buf->raw_buf);
+			xdp_return_frame(tx_buf->xdpf);
 		else
 			napi_consume_skb(tx_buf->skb, napi_budget);
 
@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
 #define I40E_XDP_CONSUMED 1
 #define I40E_XDP_TX 2
 
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring);
 
+static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp,
+				 struct i40e_ring *xdp_ring)
+{
+	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+
+	return i40e_xmit_xdp_ring(xdpf, xdp_ring);
+}
+
 /**
  * i40e_run_xdp - run an XDP program
  * @rx_ring: Rx ring being processed
@@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
 		break;
 	case XDP_TX:
 		xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
-		result = i40e_xmit_xdp_ring(xdp, xdp_ring);
+		result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
@@ -3478,28 +3491,28 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
  * @xdp: data to transmit
  * @xdp_ring: XDP Tx ring
  **/
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring)
 {
-	u32 size = xdp->data_end - xdp->data;
 	u16 i = xdp_ring->next_to_use;
 	struct i40e_tx_buffer *tx_bi;
 	struct i40e_tx_desc *tx_desc;
+	u32 size = xdpf->len;
 	dma_addr_t dma;
 
 	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
 		xdp_ring->tx_stats.tx_busy++;
 		return I40E_XDP_CONSUMED;
 	}
 
-	dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
+	dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(xdp_ring->dev, dma))
 		return I40E_XDP_CONSUMED;
 
 	tx_bi = &xdp_ring->tx_bi[i];
 	tx_bi->bytecount = size;
 	tx_bi->gso_segs = 1;
-	tx_bi->raw_buf = xdp->data;
+	tx_bi->xdpf = xdpf;
 
 	/* record length, and DMA address */
 	dma_unmap_len_set(tx_bi, len, size);
@@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  *
  * Returns Zero if sent, else an error code
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
+	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
 	if (err != I40E_XDP_TX)
 		return -ENOSPC;
 
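The last hunk shows the new contract: i40e_xdp_xmit() receives an already-built xdp_frame and, on success, owns it until TX completion. The caller's side lives in the core redirect path, which is changed elsewhere in this series, not in this file; a hedged, approximate sketch of that caller, reconstructed only from the new driver signature:

/* Approximate sketch, NOT from this commit: how the core XDP_REDIRECT
 * path is expected to hand a frame to a driver once ndo_xdp_xmit
 * takes an xdp_frame.
 */
static int xdp_redirect_to_dev(struct net_device *dev, struct xdp_buff *xdp)
{
	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);

	if (unlikely(!xdpf))
		return -EOVERFLOW;	/* no headroom for frame metadata */

	/* On success the driver keeps xdpf and must release it later
	 * with xdp_return_frame().
	 */
	return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
}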

drivers/net/ethernet/intel/i40e/i40e_txrx.h

Lines changed: 2 additions & 1 deletion
@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 struct i40e_tx_buffer {
 	struct i40e_tx_desc *next_to_watch;
 	union {
+		struct xdp_frame *xdpf;
 		struct sk_buff *skb;
 		void *raw_buf;
 	};
@@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp);
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**

drivers/net/ethernet/intel/ixgbe/ixgbe.h

Lines changed: 1 addition & 2 deletions
@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
 	unsigned long time_stamp;
 	union {
 		struct sk_buff *skb;
-		/* XDP uses address ptr on irq_clean */
-		void *data;
+		struct xdp_frame *xdpf;
 	};
 	unsigned int bytecount;
 	unsigned short gso_segs;
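
i40e and ixgbe make the same change to their per-descriptor TX buffer: the union slot now records a frame pointer rather than a raw data pointer. The invariant that makes the union safe (a TX ring is either an skb ring or an XDP ring, never mixed) is sketched below; field names are as in the diffs above.

/* Sketch: the shared TX-buffer slot both drivers now use. Exactly one
 * member is live per entry, and ring_is_xdp() decides which free
 * routine the completion path may call.
 */
union {
	struct sk_buff *skb;		/* skb ring: napi_consume_skb() */
	struct xdp_frame *xdpf;		/* XDP ring: xdp_return_frame() */
};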

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

Lines changed: 27 additions & 11 deletions
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2262,14 +2262,15 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #define IXGBE_XDP_TX 2
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp);
+			       struct xdp_frame *xdpf);
 
 static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 				     struct ixgbe_ring *rx_ring,
 				     struct xdp_buff *xdp)
 {
 	int err, result = IXGBE_XDP_PASS;
 	struct bpf_prog *xdp_prog;
+	struct xdp_frame *xdpf;
 	u32 act;
 
 	rcu_read_lock();
@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
 		break;
 	case XDP_TX:
-		result = ixgbe_xmit_xdp_ring(adapter, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf)) {
+			result = IXGBE_XDP_CONSUMED;
+			break;
+		}
+		result = ixgbe_xmit_xdp_ring(adapter, xdpf);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
@@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 	struct device *dev = rx_ring->dev;
 	int orig_node = dev_to_node(dev);
 	int ring_node = -1;
-	int size;
+	int size, err;
 
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 
@@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 			     rx_ring->queue_index) < 0)
 		goto err;
 
+	err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,
+					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (err) {
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+		goto err;
+	}
+
 	rx_ring->xdp_prog = adapter->xdp_prog;
 
 	return 0;
@@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp)
+			       struct xdp_frame *xdpf)
 {
 	struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
 	struct ixgbe_tx_buffer *tx_buffer;
@@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	dma_addr_t dma;
 	u16 i;
 
-	len = xdp->data_end - xdp->data;
+	len = xdpf->len;
 
 	if (unlikely(!ixgbe_desc_unused(ring)))
 		return IXGBE_XDP_CONSUMED;
 
-	dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+	dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
 	if (dma_mapping_error(ring->dev, dma))
 		return IXGBE_XDP_CONSUMED;
 
@@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
-	tx_buffer->data = xdp->data;
+	tx_buffer->xdpf = xdpf;
+
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
 	/* put descriptor type bits */
@@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdp);
+	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
 	if (err != IXGBE_XDP_TX)
 		return -ENOSPC;
 
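ixgbe stays on the plain page allocator with shared pages, so it registers MEM_TYPE_PAGE_SHARED: frames from its RX queues are returned via the page-fragment path rather than a page_pool. A minimal sketch of the two-step RX-queue registration drivers now perform at ring setup; both calls are the real APIs used in the hunk above, while the surrounding variables are illustrative:

/* Sketch of RX-queue setup under the memory-return API. Step 1
 * predates this series; step 2 is new and records how frames built
 * from this queue's buffers must be returned.
 */
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, netdev, queue_index);	/* 1 */
if (err)
	goto fail;

err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,		/* 2 */
				 MEM_TYPE_PAGE_SHARED, NULL);
if (err) {
	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);	/* unwind step 1 */
	goto fail;
}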

drivers/net/ethernet/mellanox/mlx5/core/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ config MLX5_CORE_EN
 	bool "Mellanox Technologies ConnectX-4 Ethernet support"
 	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
+	select PAGE_POOL
 	default n
 	---help---
 	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 4 additions & 0 deletions
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -392,6 +394,7 @@ struct mlx5e_xdpsq {
 	struct {
 		struct mlx5e_dma_info *di;
 		bool doorbell;
+		bool redirect_flush;
 	} db;
 
 	/* read only */
@@ -533,6 +536,7 @@ struct mlx5e_rq {
 	unsigned int hw_mtu;
 	struct mlx5e_xdpsq xdpsq;
 	DECLARE_BITMAP(flags, 8);
+	struct page_pool *page_pool;
 
 	/* control */
 	struct mlx5_wq_ctrl wq_ctrl;

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 36 additions & 1 deletion
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_rq_param *rqp,
 			  struct mlx5e_rq *rq)
 {
+	struct page_pool_params pp_params = { 0 };
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count;
+	u32 byte_count, pool_size;
 	int npages;
 	int wq_sz;
 	int err;
@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+	pool_size = 1 << params->log_rq_mtu_frames;
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+
+		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
+	/* Create a page_pool and register it with rxq */
+	pp_params.order = rq->buff.page_order;
+	pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
+	pp_params.pool_size = pool_size;
+	pp_params.nid = cpu_to_node(c->cpu);
+	pp_params.dev = c->pdev;
+	pp_params.dma_dir = rq->buff.map_dir;
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+			kfree(rq->wqe.frag_info);
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_rq_wq_destroy;
+	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL, rq->page_pool);
+	if (err)
+		goto err_rq_wq_destroy;
+
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 
@@ -548,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		bpf_prog_put(rq->xdp_prog);
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
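
The hunks above make mlx5 the first in-tree consumer of the refurbished page_pool: one pool per RX queue, sized to the ring, registered as the queue's memory model so that xdp_return_frame() recycles pages into the pool instead of handing them back to the page allocator. A self-contained sketch of the same sequence, assuming illustrative ring_size/dev/rxq_info variables; the param fields and all three calls match the hunk above:

#include <net/page_pool.h>
#include <net/xdp.h>

struct page_pool_params pp_params = { 0 };
struct page_pool *pool;
int err;

pp_params.order = 0;			/* order-0 (4K) pages */
pp_params.flags = 0;			/* pool does not DMA-map here */
pp_params.pool_size = ring_size;	/* roughly one page per RX slot */
pp_params.nid = NUMA_NO_NODE;		/* or the ring's home NUMA node */
pp_params.dev = dev;			/* struct device * used for DMA */
pp_params.dma_dir = DMA_FROM_DEVICE;	/* DMA_BIDIRECTIONAL with XDP_TX */

pool = page_pool_create(&pp_params);
if (IS_ERR(pool))
	return PTR_ERR(pool);

/* Tie the pool to the RX queue: frames built from this queue's
 * buffers now return through the pool (fast recycle) on
 * xdp_return_frame().
 */
err = xdp_rxq_info_reg_mem_model(&rxq_info, MEM_TYPE_PAGE_POOL, pool);
if (err)
	page_pool_destroy(pool);	/* unwind on failure */

Note that page_pool_create() reports failure via ERR_PTR, which is why the mlx5 hunk checks IS_ERR() rather than NULL before clearing rq->page_pool.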
