Skip to content

Commit 9879a38

Browse files
Jiong Wang authored and borkmann committed
nfp: bpf: implement memory bulk copy for length within 32-bytes
For NFP, we want to re-group a sequence of load/store pairs lowered from memcpy/memmove into a single memory bulk operation which then could be accelerated using the NFP CPP bus. This patch extends the existing load/store auxiliary information by adding two new fields: struct bpf_insn *paired_st; s16 ldst_gather_len; Both fields are supposed to be carried by the load instruction at the head of the sequence. "paired_st" is the corresponding store instruction at the head and "ldst_gather_len" is the gathered length. If "ldst_gather_len" is negative, then the sequence is doing memory load/store in descending order, otherwise it is in ascending order. We need this information to detect overlapped memory access. This patch then optimizes memory bulk copy when the copy length is within 32-bytes. The strategy of read/write used is: * Read. Use read32 (direct_ref), always. * Write. - length <= 8-bytes: write8 (direct_ref). - length <= 32-bytes and is 4-byte aligned: write32 (direct_ref). - length <= 32-bytes but is not 4-byte aligned: write8 (indirect_ref). NOTE: the optimization should not change program semantics. The destination register of the last load instruction should contain the same value before and after this optimization. Signed-off-by: Jiong Wang <[email protected]> Reviewed-by: Jakub Kicinski <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]>
1 parent 5e4d6d2 commit 9879a38

File tree

4 files changed

+122
-0
lines changed

4 files changed

+122
-0
lines changed

drivers/net/ethernet/netronome/nfp/bpf/jit.c

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
154154
emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
155155
}
156156

157+
static void
158+
emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
159+
swreg lreg, swreg rreg, u8 size, bool sync)
160+
{
161+
emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
162+
}
163+
157164
static void
158165
__emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
159166
enum br_ctx_signal_state css, u16 addr, u8 defer)
@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
515522
wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
516523
}
517524

525+
/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
526+
* result to @dst from low end.
527+
*/
528+
static void
529+
wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
530+
u8 offset)
531+
{
532+
enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
533+
u8 mask = (1 << field_len) - 1;
534+
535+
emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
536+
}
537+
538+
/* NFP has Command Push Pull bus which supports bluk memory operations. */
539+
static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
540+
{
541+
bool descending_seq = meta->ldst_gather_len < 0;
542+
s16 len = abs(meta->ldst_gather_len);
543+
swreg src_base, off;
544+
unsigned int i;
545+
u8 xfer_num;
546+
547+
if (WARN_ON_ONCE(len > 32))
548+
return -EOPNOTSUPP;
549+
550+
off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
551+
src_base = reg_a(meta->insn.src_reg * 2);
552+
xfer_num = round_up(len, 4) / 4;
553+
554+
/* Memory read from source addr into transfer-in registers. */
555+
emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
556+
xfer_num - 1, true);
557+
558+
/* Move from transfer-in to transfer-out. */
559+
for (i = 0; i < xfer_num; i++)
560+
wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
561+
562+
off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
563+
564+
if (len <= 8) {
565+
/* Use single direct_ref write8. */
566+
emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
567+
reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
568+
true);
569+
} else if (IS_ALIGNED(len, 4)) {
570+
/* Use single direct_ref write32. */
571+
emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
572+
reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
573+
true);
574+
} else {
575+
/* Use single indirect_ref write8. */
576+
wrp_immed(nfp_prog, reg_none(),
577+
CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
578+
emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
579+
reg_a(meta->paired_st->dst_reg * 2), off,
580+
len - 1, true);
581+
}
582+
583+
/* TODO: The following extra load is to make sure data flow be identical
584+
* before and after we do memory copy optimization.
585+
*
586+
* The load destination register is not guaranteed to be dead, so we
587+
* need to make sure it is loaded with the value the same as before
588+
* this transformation.
589+
*
590+
* These extra loads could be removed once we have accurate register
591+
* usage information.
592+
*/
593+
if (descending_seq)
594+
xfer_num = 0;
595+
else if (BPF_SIZE(meta->insn.code) != BPF_DW)
596+
xfer_num = xfer_num - 1;
597+
else
598+
xfer_num = xfer_num - 2;
599+
600+
switch (BPF_SIZE(meta->insn.code)) {
601+
case BPF_B:
602+
wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
603+
reg_xfer(xfer_num), 1,
604+
IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
605+
break;
606+
case BPF_H:
607+
wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
608+
reg_xfer(xfer_num), 2, (len & 3) ^ 2);
609+
break;
610+
case BPF_W:
611+
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
612+
reg_xfer(0));
613+
break;
614+
case BPF_DW:
615+
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
616+
reg_xfer(xfer_num));
617+
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
618+
reg_xfer(xfer_num + 1));
619+
break;
620+
}
621+
622+
if (BPF_SIZE(meta->insn.code) != BPF_DW)
623+
wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
624+
625+
return 0;
626+
}
627+
518628
static int
519629
data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
520630
{
@@ -1490,6 +1600,9 @@ static int
14901600
mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
14911601
unsigned int size)
14921602
{
1603+
if (meta->ldst_gather_len)
1604+
return nfp_cpp_memcpy(nfp_prog, meta);
1605+
14931606
if (meta->ptr.type == PTR_TO_CTX) {
14941607
if (nfp_prog->type == BPF_PROG_TYPE_XDP)
14951608
return mem_ldx_xdp(nfp_prog, meta, size);

drivers/net/ethernet/netronome/nfp/bpf/main.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
9595
* struct nfp_insn_meta - BPF instruction wrapper
9696
* @insn: BPF instruction
9797
* @ptr: pointer type for memory operations
98+
* @ldst_gather_len: memcpy length gathered from load/store sequence
99+
* @paired_st: the paired store insn at the head of the sequence
98100
* @ptr_not_const: pointer is not always constant
99101
* @jmp_dst: destination info for jump instructions
100102
* @off: index of first generated machine instruction (in nfp_prog.prog)
@@ -109,6 +111,8 @@ struct nfp_insn_meta {
109111
union {
110112
struct {
111113
struct bpf_reg_state ptr;
114+
struct bpf_insn *paired_st;
115+
s16 ldst_gather_len;
112116
bool ptr_not_const;
113117
};
114118
struct nfp_insn_meta *jmp_dst;

drivers/net/ethernet/netronome/nfp/nfp_asm.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
4343
[CMD_TGT_WRITE8_SWAP] = { 0x02, 0x42 },
44+
[CMD_TGT_WRITE32_SWAP] = { 0x02, 0x5f },
4445
[CMD_TGT_READ8] = { 0x01, 0x43 },
4546
[CMD_TGT_READ32] = { 0x00, 0x5c },
4647
[CMD_TGT_READ32_LE] = { 0x01, 0x5c },

drivers/net/ethernet/netronome/nfp/nfp_asm.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ struct cmd_tgt_act {
220220
enum cmd_tgt_map {
221221
CMD_TGT_READ8,
222222
CMD_TGT_WRITE8_SWAP,
223+
CMD_TGT_WRITE32_SWAP,
223224
CMD_TGT_READ32,
224225
CMD_TGT_READ32_LE,
225226
CMD_TGT_READ32_SWAP,
@@ -241,6 +242,9 @@ enum cmd_ctx_swap {
241242
CMD_CTX_NO_SWAP = 3,
242243
};
243244

245+
/* Fields OR-ed together to build the immediate that is emitted right before
 * an indirect_ref command: CMD_OVE_LEN is a single flag bit (bit 7) and
 * CMD_OV_LEN is a 5-bit field (bits 12:8) that carries "length - 1".
 * Presumably "override enable" / "override value" for the command length -
 * TODO confirm against the NFP hardware documentation.
 */
#define CMD_OVE_LEN BIT(7)
246+
#define CMD_OV_LEN GENMASK(12, 8)
247+
244248
#define OP_LCSR_BASE 0x0fc00000000ULL
245249
#define OP_LCSR_A_SRC 0x000000003ffULL
246250
#define OP_LCSR_B_SRC 0x000000ffc00ULL

0 commit comments

Comments
 (0)