Commit aa1d3fa

alobakin authored and borkmann committed
ice: Robustify cleaning/completing XDP Tx buffers
When queueing frames from a Page Pool for redirecting to a device backed by the ice driver, `perf top` shows heavy load on page_alloc() and page_frag_free(), even though on a properly working system this path should be fully, or at least almost, zero-alloc. The problem is in fact a bit deeper and arises from how ice cleans up completed Tx buffers.

The story so far: when cleaning/freeing the resources related to a particular completed Tx frame (skbs, DMA mappings, etc.), ice relies on heuristics alone, without setting any type explicitly (except for dummy Flow Director packets, which are marked via ice_tx_buf::tx_flags). This mostly works, but only up to a point. For example, ice currently assumes that each frame coming to __ice_xmit_xdp_ring() is backed by either a plain order-0 page or a plain page frag, while it may also be backed by a Page Pool or any other memory model introduced in the future. This means any &xdp_frame must be freed properly via the xdp_return_frame() family, with no assumptions.

In order to do that, the heuristics must be replaced with setting the Tx buffer/frame type explicitly, the way it has always been done elsewhere: via an enum. Let us reuse 16 bits from ::tx_flags (one extra AND instruction won't hurt much), especially given that there was sometimes a check for %ICE_TX_FLAGS_DUMMY_PKT, which is now turned from a flag into an enum member. The rest of the changes are straightforward; most of them just convert the code to rely on the type set in &ice_tx_buf rather than on secondary properties.

For now, no functional changes are intended; this change only prepares the ground for freeing XDP frames properly in the next step. That next step has to land atomically/synchronously so as not to break things.

Signed-off-by: Alexander Lobakin <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Acked-by: Maciej Fijalkowski <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
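The core idea is easy to see in isolation: tag each Tx buffer with an explicit type when it is queued, then dispatch on that tag at completion time instead of guessing from secondary properties. Below is a minimal userspace sketch of that pattern; the names and the printf() bodies are hypothetical stand-ins, not the driver's code:

/* sketch.c: explicit buffer typing instead of cleanup heuristics */
#include <stdio.h>

enum buf_type { BUF_EMPTY = 0, BUF_DUMMY, BUF_SKB, BUF_XDP_TX };

struct tx_buf {
	void *data;
	unsigned int type : 16;		/* enum buf_type, packed with flags */
	unsigned int flags : 16;	/* remaining flag bits */
};

/* completion path: one switch, no guessing from secondary properties */
static void complete_tx_buf(struct tx_buf *buf)
{
	switch (buf->type) {
	case BUF_DUMMY:
		printf("kfree() dummy FDir packet\n");
		break;
	case BUF_SKB:
		printf("consume skb\n");
		break;
	case BUF_XDP_TX:
		printf("page_frag_free() XDP data\n");
		break;
	}
	buf->type = BUF_EMPTY;	/* mark the ring slot reusable */
}

int main(void)
{
	struct tx_buf buf = { .type = BUF_XDP_TX };

	complete_tx_buf(&buf);	/* takes the XDP_TX branch */
	return 0;
}

Because the empty state is zero, a zeroed ring starts out with every slot already in the "no action required" state, which is presumably why the real enum pins ICE_TX_BUF_EMPTY = 0U.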
1 parent 923096b commit aa1d3fa

File tree

4 files changed: +63 additions, -36 deletions


drivers/net/ethernet/intel/ice/ice_txrx.c (19 additions, 19 deletions)

@@ -85,7 +85,7 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
 	td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
 		 ICE_TX_DESC_CMD_RE;
 
-	tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
+	tx_buf->type = ICE_TX_BUF_DUMMY;
 	tx_buf->raw_buf = raw_packet;
 
 	tx_desc->cmd_type_offset_bsz =
@@ -112,28 +112,26 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
 static void
 ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
 {
-	if (tx_buf->skb) {
-		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT) {
-			devm_kfree(ring->dev, tx_buf->raw_buf);
-		} else if (ice_ring_is_xdp(ring)) {
-			page_frag_free(tx_buf->raw_buf);
-		} else {
-			dev_kfree_skb_any(tx_buf->skb);
-		}
-		if (dma_unmap_len(tx_buf, len))
-			dma_unmap_single(ring->dev,
-					 dma_unmap_addr(tx_buf, dma),
-					 dma_unmap_len(tx_buf, len),
-					 DMA_TO_DEVICE);
-	} else if (dma_unmap_len(tx_buf, len)) {
+	if (dma_unmap_len(tx_buf, len))
 		dma_unmap_page(ring->dev,
 			       dma_unmap_addr(tx_buf, dma),
 			       dma_unmap_len(tx_buf, len),
 			       DMA_TO_DEVICE);
+
+	switch (tx_buf->type) {
+	case ICE_TX_BUF_DUMMY:
+		devm_kfree(ring->dev, tx_buf->raw_buf);
+		break;
+	case ICE_TX_BUF_SKB:
+		dev_kfree_skb_any(tx_buf->skb);
+		break;
+	case ICE_TX_BUF_XDP_TX:
+		page_frag_free(tx_buf->raw_buf);
+		break;
 	}
 
 	tx_buf->next_to_watch = NULL;
-	tx_buf->skb = NULL;
+	tx_buf->type = ICE_TX_BUF_EMPTY;
 	dma_unmap_len_set(tx_buf, len, 0);
 	/* tx_buf must be completely set up in the transmit path */
 }
@@ -266,7 +264,7 @@ static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
 				 DMA_TO_DEVICE);
 
 		/* clear tx_buf data */
-		tx_buf->skb = NULL;
+		tx_buf->type = ICE_TX_BUF_EMPTY;
 		dma_unmap_len_set(tx_buf, len, 0);
 
 		/* unmap remaining buffers */
@@ -1709,6 +1707,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
 				 DMA_TO_DEVICE);
 
 		tx_buf = &tx_ring->tx_buf[i];
+		tx_buf->type = ICE_TX_BUF_FRAG;
 	}
 
 	/* record SW timestamp if HW timestamp is not available */
@@ -2352,6 +2351,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_buf[tx_ring->next_to_use];
 	first->skb = skb;
+	first->type = ICE_TX_BUF_SKB;
 	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
 	first->gso_segs = 1;
 	first->tx_flags = 0;
@@ -2524,11 +2524,11 @@ void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
 				 dma_unmap_addr(tx_buf, dma),
 				 dma_unmap_len(tx_buf, len),
 				 DMA_TO_DEVICE);
-		if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
+		if (tx_buf->type == ICE_TX_BUF_DUMMY)
 			devm_kfree(tx_ring->dev, tx_buf->raw_buf);
 
 		/* clear next_to_watch to prevent false hangs */
-		tx_buf->raw_buf = NULL;
+		tx_buf->type = ICE_TX_BUF_EMPTY;
 		tx_buf->tx_flags = 0;
 		tx_buf->next_to_watch = NULL;
 		dma_unmap_len_set(tx_buf, len, 0);
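Note that the new switch in ice_unmap_and_free_tx_buf() deliberately has no ICE_TX_BUF_FRAG or ICE_TX_BUF_EMPTY cases: per the enum's kernel-doc in the header diff below, frag buffers only need their DMA mapping undone (already handled above the switch), and empty slots need no action at all.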

drivers/net/ethernet/intel/ice/ice_txrx.h (25 additions, 9 deletions)

@@ -121,10 +121,7 @@ static inline int ice_skb_pad(void)
 #define ICE_TX_FLAGS_TSO	BIT(0)
 #define ICE_TX_FLAGS_HW_VLAN	BIT(1)
 #define ICE_TX_FLAGS_SW_VLAN	BIT(2)
-/* ICE_TX_FLAGS_DUMMY_PKT is used to mark dummy packets that should be
- * freed instead of returned like skb packets.
- */
-#define ICE_TX_FLAGS_DUMMY_PKT	BIT(3)
+/* Free, was ICE_TX_FLAGS_DUMMY_PKT */
 #define ICE_TX_FLAGS_TSYN	BIT(4)
 #define ICE_TX_FLAGS_IPV4	BIT(5)
 #define ICE_TX_FLAGS_IPV6	BIT(6)
@@ -149,22 +146,41 @@ static inline int ice_skb_pad(void)
 
 #define ICE_TXD_LAST_DESC_CMD (ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS)
 
+/**
+ * enum ice_tx_buf_type - type of &ice_tx_buf to act on Tx completion
+ * @ICE_TX_BUF_EMPTY: unused OR XSk frame, no action required
+ * @ICE_TX_BUF_DUMMY: dummy Flow Director packet, unmap and kfree()
+ * @ICE_TX_BUF_FRAG: mapped skb OR &xdp_buff frag, only unmap DMA
+ * @ICE_TX_BUF_SKB: &sk_buff, unmap and consume_skb(), update stats
+ * @ICE_TX_BUF_XDP_TX: &xdp_buff, unmap and page_frag_free(), stats
+ * @ICE_TX_BUF_XSK_TX: &xdp_buff on XSk queue, xsk_buff_free(), stats
+ */
+enum ice_tx_buf_type {
+	ICE_TX_BUF_EMPTY	= 0U,
+	ICE_TX_BUF_DUMMY,
+	ICE_TX_BUF_FRAG,
+	ICE_TX_BUF_SKB,
+	ICE_TX_BUF_XDP_TX,
+	ICE_TX_BUF_XSK_TX,
+};
+
 struct ice_tx_buf {
 	union {
 		struct ice_tx_desc *next_to_watch;
 		u32 rs_idx;
 	};
 	union {
-		struct sk_buff *skb;
-		void *raw_buf;		/* used for XDP */
-		struct xdp_buff *xdp;	/* used for XDP_TX ZC */
+		void *raw_buf;		/* used for XDP_TX and FDir rules */
+		struct sk_buff *skb;	/* used for .ndo_start_xmit() */
+		struct xdp_buff *xdp;	/* used for XDP_TX ZC */
 	};
 	unsigned int bytecount;
 	union {
 		unsigned int gso_segs;
-		unsigned int nr_frags;	/* used for mbuf XDP */
+		unsigned int nr_frags;	/* used for mbuf XDP */
 	};
-	u32 tx_flags;
+	u32 type:16;			/* &ice_tx_buf_type */
+	u32 tx_flags:16;
 	DEFINE_DMA_UNMAP_LEN(len);
 	DEFINE_DMA_UNMAP_ADDR(dma);
 };
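Squeezing ::type and ::tx_flags into a single u32 works only while the enum stays below 2^16. The patch itself adds no guard for that; if one wanted to document the invariant at build time, a compile-time assertion along these lines would do (hypothetical sketch, not part of the commit):

/* Hypothetical build-time guard (not in this patch): the highest member
 * of &ice_tx_buf_type must fit the 16-bit ice_tx_buf::type bitfield.
 */
static_assert(ICE_TX_BUF_XSK_TX < BIT(16),
	      "enum ice_tx_buf_type must fit ice_tx_buf::type (16 bits)");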

drivers/net/ethernet/intel/ice/ice_txrx_lib.c (12 additions, 3 deletions)

@@ -231,8 +231,14 @@ ice_clean_xdp_tx_buf(struct ice_tx_ring *xdp_ring, struct ice_tx_buf *tx_buf)
 	dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma),
 			 dma_unmap_len(tx_buf, len), DMA_TO_DEVICE);
 	dma_unmap_len_set(tx_buf, len, 0);
-	page_frag_free(tx_buf->raw_buf);
-	tx_buf->raw_buf = NULL;
+
+	switch (tx_buf->type) {
+	case ICE_TX_BUF_XDP_TX:
+		page_frag_free(tx_buf->raw_buf);
+		break;
+	}
+
+	tx_buf->type = ICE_TX_BUF_EMPTY;
 }
 
 /**
@@ -266,6 +272,7 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 
 	while (ready_frames) {
 		struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
+		struct ice_tx_buf *head = tx_buf;
 
 		/* bytecount holds size of head + frags */
 		total_bytes += tx_buf->bytecount;
@@ -275,7 +282,6 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 		ready_frames -= frags + 1;
 		xdp_tx++;
 
-		ice_clean_xdp_tx_buf(xdp_ring, tx_buf);
 		ntc++;
 		if (ntc == cnt)
 			ntc = 0;
@@ -288,6 +294,8 @@ static u32 ice_clean_xdp_irq(struct ice_tx_ring *xdp_ring)
 			if (ntc == cnt)
 				ntc = 0;
 		}
+
+		ice_clean_xdp_tx_buf(xdp_ring, head);
 	}
 
 	tx_desc->cmd_type_offset_bsz = 0;
@@ -349,6 +357,7 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring)
 
 	tx_desc->buf_addr = cpu_to_le64(dma);
 	tx_desc->cmd_type_offset_bsz = ice_build_ctob(0, 0, size, 0);
+	tx_buf->type = ICE_TX_BUF_XDP_TX;
 	tx_buf->raw_buf = data;
 
 	ntu++;
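The loop reshuffle in ice_clean_xdp_irq() is subtle but deliberate: the head buffer is now remembered and cleaned only after every frag slot has been walked. This ordering matters for the next step the commit message announces, freeing via the xdp_return_frame() family, where returning the head can release memory the frag slots still reference. A standalone toy model of the resulting order (hypothetical slot layout; no DMA, stats, or ring wrap-around):

/* order.c: head-last cleanup order, as in ice_clean_xdp_irq() above */
#include <stdio.h>

#define CNT 8

struct slot { int nr_frags; };	/* meaningful only on head slots */

int main(void)
{
	/* packet 1: head at slot 0 with 2 frags; packet 2: head at slot 3 */
	struct slot ring[CNT] = { { 2 }, { 0 }, { 0 }, { 0 } };
	unsigned int ntc = 0, ready_frames = 4;

	while (ready_frames) {
		unsigned int head = ntc;
		int frags = ring[head].nr_frags;

		ready_frames -= frags + 1;	/* count head + frags */
		ntc++;				/* step past the head... */

		for (int i = 0; i < frags; i++) {
			printf("clean frag slot %u\n", ntc);
			ntc++;			/* ...then past each frag */
		}

		/* only now is the head handed back for freeing */
		printf("clean head slot %u (last)\n", head);
	}
	return 0;
}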

drivers/net/ethernet/intel/ice/ice_xsk.c (7 additions, 5 deletions)

@@ -631,7 +631,8 @@ static void ice_clean_xdp_irq_zc(struct ice_tx_ring *xdp_ring)
 	for (i = 0; i < xsk_frames; i++) {
 		tx_buf = &xdp_ring->tx_buf[ntc];
 
-		if (tx_buf->xdp) {
+		if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
+			tx_buf->type = ICE_TX_BUF_EMPTY;
 			xsk_buff_free(tx_buf->xdp);
 			xdp_ring->xdp_tx_active--;
 		} else {
@@ -685,6 +686,7 @@ static int ice_xmit_xdp_tx_zc(struct xdp_buff *xdp,
 
 	tx_buf = &xdp_ring->tx_buf[ntu];
 	tx_buf->xdp = xdp;
+	tx_buf->type = ICE_TX_BUF_XSK_TX;
 	tx_desc = ICE_TX_DESC(xdp_ring, ntu);
 	tx_desc->buf_addr = cpu_to_le64(dma);
 	tx_desc->cmd_type_offset_bsz = ice_build_ctob(ICE_TX_DESC_CMD_EOP,
@@ -1083,12 +1085,12 @@ void ice_xsk_clean_xdp_ring(struct ice_tx_ring *xdp_ring)
 	while (ntc != ntu) {
 		struct ice_tx_buf *tx_buf = &xdp_ring->tx_buf[ntc];
 
-		if (tx_buf->xdp)
+		if (tx_buf->type == ICE_TX_BUF_XSK_TX) {
+			tx_buf->type = ICE_TX_BUF_EMPTY;
 			xsk_buff_free(tx_buf->xdp);
-		else
+		} else {
 			xsk_frames++;
-
-		tx_buf->raw_buf = NULL;
+		}
 
 		ntc++;
 		if (ntc >= xdp_ring->count)
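On the XSk side, the type doubles as a "processed" marker: the slot is retyped to ICE_TX_BUF_EMPTY before xsk_buff_free() is called, so a slot never keeps claiming an ::xdp pointer that has already been handed back to the pool. A standalone toy version of that discipline, with malloc()/free() as hypothetical stand-ins for the XSk pool calls:

/* xsk_slot.c: retype the slot before freeing what it points at */
#include <stdio.h>
#include <stdlib.h>

enum buf_type { BUF_EMPTY = 0, BUF_XSK_TX };

struct tx_buf {
	void *xdp;		/* union member, dangles once freed */
	unsigned int type : 16;
};

static void slot_free(struct tx_buf *buf)
{
	if (buf->type != BUF_XSK_TX)
		return;		/* EMPTY slots need no action */

	buf->type = BUF_EMPTY;	/* retype first: ::xdp is about to dangle */
	free(buf->xdp);		/* stand-in for xsk_buff_free() */
}

int main(void)
{
	struct tx_buf buf = { .xdp = malloc(64), .type = BUF_XSK_TX };

	slot_free(&buf);	/* frees the buffer, slot now EMPTY */
	slot_free(&buf);	/* second pass is a harmless no-op */
	printf("slot type = %u\n", buf.type);
	return 0;
}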
