
Commit db05815

Maxim Mikityanskiy authored and borkmann committed
net/mlx5e: Add XSK zero-copy support
This commit adds support for AF_XDP zero-copy RX and TX.

We create a dedicated XSK RQ inside the channel, which means two RQs run simultaneously: one for non-XSK traffic and the other for XSK traffic. The regular and XSK RQs use a single ID namespace split into two halves: the lower half is regular RQs, and the upper half is XSK RQs. When any zero-copy AF_XDP socket is active, changing the number of channels is not allowed, because it would break the mapping between XSK RQ IDs and channels (see the sketch below).

XSK requires different page allocation and release routines. Functions such as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are generic enough to be used for both regular and XSK RQs, and they use the mlx5e_page_{alloc,release} wrappers around the real allocation functions. Function pointers are not used, to avoid the performance cost of retpolines. Wherever it's certain that the regular (non-XSK) page release function should be used, it's called directly.

Only the stats that are meaningful for XSK are exposed to userspace. Those that don't take part in the XSK flow are not considered.

Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ), because the newer xdpsock sample doesn't provide any Fill Ring entries at the setup stage.

We create a dedicated XSK SQ in the channel. This separation has its advantages:

1. When the UMEM is closed, the XSK SQ can also be closed and stop receiving completions. If an existing SQ were used for XSK, it would keep receiving completions for the packets of the closed socket. If a new UMEM was opened at that point, it would start getting completions that don't belong to it.

2. Statistics are calculated separately.

When userspace kicks the TX, the driver triggers a hardware interrupt by posting a NOP to a dedicated XSK ICO (internal control operations) SQ, in order to trigger NAPI on the right CPU core. This XSK ICO SQ is protected by a spinlock, as the userspace application may kick the TX from any core.

Store the pointers to the UMEMs in the net device private context, independently from the kernel. This way the driver can distinguish between zero-copy and non-zero-copy UMEMs. The kernel function xdp_get_umem_from_qid does not care about this difference, but the driver is only interested in zero-copy UMEMs; in particular, on cleanup it determines whether to close the XSK RQ and SQ by checking for the presence of the UMEM. Use state_lock to protect access to this array of UMEM pointers.

LRO isn't compatible with XDP, but there may be active UMEMs while XDP is off. In this case, don't allow LRO, to ensure XDP can be re-enabled at any time.

The validation of XSK parameters typically happens when the XSK queues open. However, when the interface is down or the XDP program isn't set, it's still possible to have active AF_XDP sockets and even to open new ones, while the XSK queues stay closed. To cover these cases, perform the validation in these flows as well:

1. A new UMEM is registered, but the XSK queues aren't going to be created because the XDP program is missing or the interface is down.

2. The MTU changes while there are UMEMs registered.

This early check prevents mlx5e_open_channels from failing at a later stage, where recovery is impossible and the application has no chance to handle the error, because it already got a successful return value from the MTU change or XSK open operation.
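To make the split ID namespace concrete, here is a minimal standalone C sketch of the mapping described above. All names are illustrative; the driver expresses the same idea through MLX5E_NUM_RQ_GROUPS and its queue-ID helpers, not through these exact functions.

    /* Standalone sketch (not the driver's code) of the split RQ ID
     * namespace: with nch channels, qids [0, nch) address the regular
     * RQs and qids [nch, 2 * nch) address the XSK RQs of the same
     * channels.
     */
    #include <assert.h>
    #include <stdio.h>

    enum rq_group { RQ_GROUP_REGULAR, RQ_GROUP_XSK, NUM_RQ_GROUPS };

    static void qid_get_ch_and_group(unsigned int qid, unsigned int nch,
                                     unsigned int *ch, enum rq_group *group)
    {
            /* The boundary between the halves is the channel count
             * itself, which is why resizing the channels while XSK
             * sockets are active would silently remap every XSK qid.
             */
            *ch = qid % nch;
            *group = qid < nch ? RQ_GROUP_REGULAR : RQ_GROUP_XSK;
    }

    int main(void)
    {
            unsigned int ch;
            enum rq_group group;

            /* qid 5 with 4 channels lands in the XSK half, channel 1 */
            qid_get_ch_and_group(5, 4, &ch, &group);
            assert(ch == 1 && group == RQ_GROUP_XSK);
            printf("qid 5 -> channel %u, XSK half\n", ch);
            return 0;
    }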
The performance testing was performed on a machine with the following configuration:

- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link

The results with retpoline disabled, single stream:

txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps

The results with retpoline enabled, single stream:

txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps

Signed-off-by: Maxim Mikityanskiy <[email protected]>
Signed-off-by: Tariq Toukan <[email protected]>
Acked-by: Saeed Mahameed <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
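The UMEM bookkeeping described in the message can be pictured with a sketch like the following. struct mlx5e_xsk matches the definition added to en.h below, but the helper bodies are illustrative, not the driver's, and callers are assumed to hold priv->state_lock.

    /* Illustrative bookkeeping sketch (kernel context assumed); only
     * struct mlx5e_xsk mirrors the real definition in en.h below, the
     * helpers themselves are hypothetical.
     */
    static struct xdp_umem *xsk_get_umem_sketch(struct mlx5e_xsk *xsk, u16 ix)
    {
            /* NULL means "no zero-copy UMEM on this queue": on cleanup
             * the driver uses exactly this presence check to decide
             * whether the channel's XSK RQ and SQ have to be closed.
             */
            return xsk->umems ? xsk->umems[ix] : NULL;
    }

    static void xsk_remove_umem_sketch(struct mlx5e_xsk *xsk, u16 ix)
    {
            xsk->umems[ix] = NULL;
            /* Drop the array once the last zero-copy UMEM goes away, so
             * an idle device keeps no per-queue pointer storage around.
             */
            if (--xsk->refcnt == 0) {
                    kfree(xsk->umems);
                    xsk->umems = NULL;
            }
    }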
1 parent 32a2365 commit db05815

File tree

24 files changed: +1840, -255 lines changed


drivers/net/ethernet/mellanox/mlx5/core/Makefile

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
 		en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o \
-		en/params.o
+		en/params.o en/xsk/umem.o en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o
 
 #
 # Netdev extra

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 88 additions & 10 deletions
@@ -137,6 +137,7 @@ struct page_pool;
 #define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1)
 #define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC)
 #define MLX5E_TX_CQ_POLL_BUDGET 128
+#define MLX5E_TX_XSK_POLL_BUDGET 64
 #define MLX5E_SQ_RECOVER_MIN_INTERVAL 500 /* msecs */
 
 #define MLX5E_UMR_WQE_INLINE_SZ \
@@ -155,6 +156,11 @@ do { \
 				##__VA_ARGS__); \
 } while (0)
 
+enum mlx5e_rq_group {
+	MLX5E_RQ_GROUP_REGULAR,
+	MLX5E_RQ_GROUP_XSK,
+	MLX5E_NUM_RQ_GROUPS /* Keep last. */
+};
 
 static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
 {
@@ -179,7 +185,8 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
 /* Use this function to get max num channels after netdev was created */
 static inline int mlx5e_get_netdev_max_channels(struct net_device *netdev)
 {
-	return min_t(unsigned int, netdev->num_rx_queues,
+	return min_t(unsigned int,
+		     netdev->num_rx_queues / MLX5E_NUM_RQ_GROUPS,
 		     netdev->num_tx_queues);
 }
 
@@ -250,6 +257,7 @@ struct mlx5e_params {
 	u32 lro_timeout;
 	u32 pflags;
 	struct bpf_prog *xdp_prog;
+	struct mlx5e_xsk *xsk;
 	unsigned int sw_mtu;
 	int hard_mtu;
 };
@@ -399,8 +407,14 @@ struct mlx5e_txqsq {
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_dma_info {
-	struct page *page;
-	dma_addr_t addr;
+	dma_addr_t addr;
+	union {
+		struct page *page;
+		struct {
+			u64 handle;
+			void *data;
+		} xsk;
+	};
 };
 
 /* XDP packets can be transmitted in different ways. On completion, we need to
@@ -467,9 +481,11 @@ struct mlx5e_xdp_mpwqe {
 };
 
 struct mlx5e_xdpsq;
+typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *);
 typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *,
 					struct mlx5e_xdp_xmit_data *,
-					struct mlx5e_xdp_info *);
+					struct mlx5e_xdp_info *,
+					int);
 
 struct mlx5e_xdpsq {
 	/* data path */
@@ -487,8 +503,10 @@ struct mlx5e_xdpsq {
 	struct mlx5e_cq cq;
 
 	/* read only */
+	struct xdp_umem *umem;
 	struct mlx5_wq_cyc wq;
 	struct mlx5e_xdpsq_stats *stats;
+	mlx5e_fp_xmit_xdp_frame_check xmit_xdp_frame_check;
 	mlx5e_fp_xmit_xdp_frame xmit_xdp_frame;
 	struct {
 		struct mlx5e_xdp_wqe_info *wqe_info;
@@ -619,6 +637,7 @@ struct mlx5e_rq {
 		} mpwqe;
 	};
 	struct {
+		u16 umem_headroom;
 		u16 headroom;
 		u8 map_dir; /* dma map direction */
 	} buff;
@@ -649,6 +668,10 @@ struct mlx5e_rq {
 	DECLARE_BITMAP(flags, 8);
 	struct page_pool *page_pool;
 
+	/* AF_XDP zero-copy */
+	struct zero_copy_allocator zca;
+	struct xdp_umem *umem;
+
 	/* control */
 	struct mlx5_wq_ctrl wq_ctrl;
 	__be32 mkey_be;
@@ -661,6 +684,11 @@ struct mlx5e_rq {
 	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_aligned_in_smp;
 
+enum mlx5e_channel_state {
+	MLX5E_CHANNEL_STATE_XSK,
+	MLX5E_CHANNEL_NUM_STATES
+};
+
 struct mlx5e_channel {
 	/* data path */
 	struct mlx5e_rq rq;
@@ -677,6 +705,13 @@ struct mlx5e_channel {
 	/* XDP_REDIRECT */
 	struct mlx5e_xdpsq xdpsq;
 
+	/* AF_XDP zero-copy */
+	struct mlx5e_rq xskrq;
+	struct mlx5e_xdpsq xsksq;
+	struct mlx5e_icosq xskicosq;
+	/* xskicosq can be accessed from any CPU - the spinlock protects it. */
+	spinlock_t xskicosq_lock;
+
 	/* data path - accessed per napi poll */
 	struct irq_desc *irq_desc;
 	struct mlx5e_ch_stats *stats;
@@ -685,6 +720,7 @@ struct mlx5e_channel {
 	struct mlx5e_priv *priv;
 	struct mlx5_core_dev *mdev;
 	struct hwtstamp_config *tstamp;
+	DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES);
 	int ix;
 	int cpu;
 	cpumask_var_t xps_cpumask;
@@ -700,14 +736,17 @@ struct mlx5e_channel_stats {
 	struct mlx5e_ch_stats ch;
 	struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
 	struct mlx5e_rq_stats rq;
+	struct mlx5e_rq_stats xskrq;
 	struct mlx5e_xdpsq_stats rq_xdpsq;
 	struct mlx5e_xdpsq_stats xdpsq;
+	struct mlx5e_xdpsq_stats xsksq;
 } ____cacheline_aligned_in_smp;
 
 enum {
 	MLX5E_STATE_OPENED,
 	MLX5E_STATE_DESTROYING,
 	MLX5E_STATE_XDP_TX_ENABLED,
+	MLX5E_STATE_XDP_OPEN,
 };
 
 struct mlx5e_rqt {
@@ -740,6 +779,17 @@ struct mlx5e_modify_sq_param {
 	int rl_index;
 };
 
+struct mlx5e_xsk {
+	/* UMEMs are stored separately from channels, because we don't want to
+	 * lose them when channels are recreated. The kernel also stores UMEMs,
+	 * but it doesn't distinguish between zero-copy and non-zero-copy UMEMs,
+	 * so rely on our mechanism.
+	 */
+	struct xdp_umem **umems;
+	u16 refcnt;
+	bool ever_used;
+};
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
@@ -760,6 +810,7 @@ struct mlx5e_priv {
 	struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
 	struct mlx5e_tir inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
 	struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
+	struct mlx5e_tir xsk_tir[MLX5E_MAX_NUM_CHANNELS];
 	struct mlx5e_rss_params rss_params;
 	u32 tx_rates[MLX5E_MAX_NUM_SQS];
 
@@ -796,6 +847,7 @@ struct mlx5e_priv {
 	struct mlx5e_tls *tls;
 #endif
 	struct devlink_health_reporter *tx_reporter;
+	struct mlx5e_xsk xsk;
 };
 
 struct mlx5e_profile {
@@ -839,8 +891,9 @@ bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
 				struct mlx5e_params *params);
 
 void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info);
-void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
-			bool recycle);
+void mlx5e_page_release_dynamic(struct mlx5e_rq *rq,
+				struct mlx5e_dma_info *dma_info,
+				bool recycle);
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
@@ -900,6 +953,30 @@ void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_rss_params *rss_params,
 void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen);
 struct mlx5e_tirc_config mlx5e_tirc_get_default_config(enum mlx5e_traffic_types tt);
 
+struct mlx5e_xsk_param;
+
+struct mlx5e_rq_param;
+int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
+		  struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk,
+		  struct xdp_umem *umem, struct mlx5e_rq *rq);
+int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time);
+void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
+void mlx5e_close_rq(struct mlx5e_rq *rq);
+
+struct mlx5e_sq_param;
+int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
+		     struct mlx5e_sq_param *param, struct mlx5e_icosq *sq);
+void mlx5e_close_icosq(struct mlx5e_icosq *sq);
+int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
+		     struct mlx5e_sq_param *param, struct xdp_umem *umem,
+		     struct mlx5e_xdpsq *sq, bool is_redirect);
+void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq);
+
+struct mlx5e_cq_param;
+int mlx5e_open_cq(struct mlx5e_channel *c, struct net_dim_cq_moder moder,
+		  struct mlx5e_cq_param *param, struct mlx5e_cq *cq);
+void mlx5e_close_cq(struct mlx5e_cq *cq);
+
 int mlx5e_open_locked(struct net_device *netdev);
 int mlx5e_close_locked(struct net_device *netdev);
 
@@ -1070,10 +1147,10 @@ int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv);
 int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc);
 void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc);
 
-int mlx5e_create_direct_rqts(struct mlx5e_priv *priv);
-void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv);
-int mlx5e_create_direct_tirs(struct mlx5e_priv *priv);
-void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv);
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
 void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
 
 int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
@@ -1142,6 +1219,7 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv);
 void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
 void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
 void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
+			    struct mlx5e_xsk *xsk,
 			    struct mlx5e_rss_params *rss_params,
 			    struct mlx5e_params *params,
 			    u16 max_channels, u16 mtu);
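The xskicosq and xskicosq_lock fields added above implement the TX kick described in the commit message. A hedged sketch of that path follows; the shapes match struct mlx5e_channel above, but post_nop_wqe() is a hypothetical stand-in for posting a NOP and ringing the doorbell.

    /* Sketch (not the driver's code) of an async-xmit style kick. */
    static int mlx5e_xsk_kick_sketch(struct mlx5e_channel *c)
    {
            if (unlikely(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)))
                    return -ENXIO;

            /* Any application core may get here, hence the spinlock. */
            spin_lock(&c->xskicosq_lock);
            post_nop_wqe(&c->xskicosq);     /* hypothetical helper */
            spin_unlock(&c->xskicosq_lock);

            /* The NOP completes on the channel's CPU and raises an IRQ
             * there, which schedules NAPI on the right core.
             */
            return 0;
    }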

drivers/net/ethernet/mellanox/mlx5/core/en/params.c

Lines changed: 32 additions & 21 deletions
@@ -49,48 +49,56 @@ u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
 	return frag_sz;
 }
 
-u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
+u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
+				struct mlx5e_xsk_param *xsk)
 {
-	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, NULL);
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
 
 	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 }
 
-bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params)
+bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
+			    struct mlx5e_xsk_param *xsk)
 {
-	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params, NULL);
+	/* AF_XDP allocates SKBs on XDP_PASS - ensure they don't occupy more
+	 * than one page. For this, check both with and without xsk.
+	 */
+	u32 linear_frag_sz = max(mlx5e_rx_get_linear_frag_sz(params, xsk),
+				 mlx5e_rx_get_linear_frag_sz(params, NULL));
 
-	return !params->lro_en && frag_sz <= PAGE_SIZE;
+	return !params->lro_en && linear_frag_sz <= PAGE_SIZE;
 }
 
 #define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \
 					  MLX5_MPWQE_LOG_STRIDE_SZ_BASE)
 bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
-				  struct mlx5e_params *params)
+				  struct mlx5e_params *params,
+				  struct mlx5e_xsk_param *xsk)
 {
-	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params, NULL);
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
 	s8 signed_log_num_strides_param;
 	u8 log_num_strides;
 
-	if (!mlx5e_rx_is_linear_skb(params))
+	if (!mlx5e_rx_is_linear_skb(params, xsk))
 		return false;
 
-	if (order_base_2(frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
+	if (order_base_2(linear_frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
 		return false;
 
 	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
 		return true;
 
-	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz);
+	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 	signed_log_num_strides_param =
 		(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
 
 	return signed_log_num_strides_param >= 0;
 }
 
-u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
+u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
+			       struct mlx5e_xsk_param *xsk)
 {
-	u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params);
+	u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params, xsk);
 
 	/* Numbers are unsigned, don't subtract to avoid underflow. */
 	if (params->log_rq_mtu_frames <
@@ -101,27 +109,30 @@ u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
 }
 
 u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
-				   struct mlx5e_params *params)
+				   struct mlx5e_params *params,
+				   struct mlx5e_xsk_param *xsk)
 {
-	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return order_base_2(mlx5e_rx_get_linear_frag_sz(params, NULL));
+	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk))
+		return order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk));
 
 	return MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
 }
 
 u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
-				   struct mlx5e_params *params)
+				   struct mlx5e_params *params,
+				   struct mlx5e_xsk_param *xsk)
 {
 	return MLX5_MPWRQ_LOG_WQE_SZ -
-		mlx5e_mpwqe_get_log_stride_size(mdev, params);
+		mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
 }
 
 u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
-			  struct mlx5e_params *params)
+			  struct mlx5e_params *params,
+			  struct mlx5e_xsk_param *xsk)
 {
 	bool is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
-		mlx5e_rx_is_linear_skb(params) :
-		mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
+		mlx5e_rx_is_linear_skb(params, xsk) :
+		mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk);
 
-	return is_linear_skb ? mlx5e_get_linear_rq_headroom(params, NULL) : 0;
+	return is_linear_skb ? mlx5e_get_linear_rq_headroom(params, xsk) : 0;
}
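A short worked example of the MPWQE arithmetic above. The value of MLX5_MPWRQ_LOG_WQE_SZ is an assumption here (18, i.e. a 256 KB multi-packet WQE, taken from the driver's headers rather than from this diff):

    /* Worked example for mlx5e_mpwqe_log_pkts_per_wqe(), assuming
     * MLX5_MPWRQ_LOG_WQE_SZ == 18:
     *
     *   linear_frag_sz = 4096           => order_base_2(4096) = 12
     *   log_pkts_per_wqe = 18 - 12 = 6  => 64 packets per WQE
     *
     * mlx5e_mpwqe_get_log_rq_size() then derives the RQ size from
     * log_rq_mtu_frames minus this value, comparing first instead of
     * subtracting because the numbers are unsigned and could underflow.
     */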
