
Commit 6c21a2a

Merge branch 'bpf-stack-tracker'
Alexei Starovoitov says:

====================
bpf: stack depth tracking

Introduce tracking of bpf program stack depth in the verifier and use that
info to reduce bpf program stack consumption in the interpreter and x64 JIT.
Other JITs can take advantage of it as well in the future.

Most programs consume very little stack, so this is a good optimization in
general, and it is the first step toward bpf-to-bpf function calls.

Also use an internal opcode to mark bpf_tail_call(), to make clear that the
jmp|call|x opcode is not uapi and may be used for an actual indirect call
opcode in the future.
====================

Signed-off-by: David S. Miller <[email protected]>
2 parents d2e0ef4 + 2960ae4 commit 6c21a2a
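
In short (an editorial sketch; the identifiers come from the kernel/bpf/core.c
hunks below, the numbers are example values): the verifier records the deepest
stack offset a program touches, and bpf_prog_select_runtime() uses that depth
to pick one of sixteen interpreter variants whose scratch stack is sized in
32-byte steps:

        u32 stack_depth = 40;                        /* deepest access the verifier saw */
        u32 idx = round_down(stack_depth, 32) / 32;  /* 40 -> index 1 */
        /* interpreters[1] is __bpf_prog_run64, which allocates a 64-byte
         * stack array instead of the old fixed MAX_BPF_STACK (512) bytes.
         */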

File tree

12 files changed: +147 -71 lines changed


arch/arm64/net/bpf_jit_comp.c

Lines changed: 1 addition & 1 deletion
@@ -586,7 +586,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		break;
 	}
 	/* tail call */
-	case BPF_JMP | BPF_CALL | BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		if (emit_bpf_tail_call(ctx))
 			return -EFAULT;
 		break;

arch/powerpc/net/bpf_jit_comp64.c

Lines changed: 1 addition & 1 deletion
@@ -938,7 +938,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 		/*
 		 * Tail call
 		 */
-		case BPF_JMP | BPF_CALL | BPF_X:
+		case BPF_JMP | BPF_TAIL_CALL:
 			ctx->seen |= SEEN_TAILCALL;
 			bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]);
 			break;

arch/s390/net/bpf_jit_comp.c

Lines changed: 1 addition & 1 deletion
@@ -991,7 +991,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		}
 		break;
 	}
-	case BPF_JMP | BPF_CALL | BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		/*
 		 * Implicit input:
 		 * B1: pointer to ctx

arch/sparc/net/bpf_jit_comp_64.c

Lines changed: 1 addition & 1 deletion
@@ -1217,7 +1217,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	}

 	/* tail call */
-	case BPF_JMP | BPF_CALL |BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		emit_tail_call(ctx);
 		break;


arch/x86/net/bpf_jit.S

Lines changed: 9 additions & 11 deletions
@@ -19,9 +19,6 @@
  */
 #define SKBDATA	%r10
 #define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
-#define MAX_BPF_STACK (512 /* from filter.h */ + \
-		       32 /* space for rbx,r13,r14,r15 */ + \
-		       8 /* space for skb_copy_bits */)

 #define FUNC(name) \
 	.globl name; \
@@ -66,7 +63,7 @@ FUNC(sk_load_byte_positive_offset)

 /* rsi contains offset and can be scratched */
 #define bpf_slow_path_common(LEN)		\
-	lea	-MAX_BPF_STACK + 32(%rbp), %rdx;\
+	lea	32(%rbp), %rdx;\
 	FRAME_BEGIN;				\
 	mov	%rbx, %rdi; /* arg1 == skb */	\
 	push	%r9;				\
@@ -83,22 +80,22 @@ FUNC(sk_load_byte_positive_offset)
 bpf_slow_path_word:
 	bpf_slow_path_common(4)
 	js	bpf_error
-	mov	- MAX_BPF_STACK + 32(%rbp),%eax
+	mov	32(%rbp),%eax
 	bswap	%eax
 	ret

 bpf_slow_path_half:
 	bpf_slow_path_common(2)
 	js	bpf_error
-	mov	- MAX_BPF_STACK + 32(%rbp),%ax
+	mov	32(%rbp),%ax
 	rol	$8,%ax
 	movzwl	%ax,%eax
 	ret

 bpf_slow_path_byte:
 	bpf_slow_path_common(1)
 	js	bpf_error
-	movzbl	- MAX_BPF_STACK + 32(%rbp),%eax
+	movzbl	32(%rbp),%eax
 	ret

 #define sk_negative_common(SIZE)		\
@@ -148,9 +145,10 @@ FUNC(sk_load_byte_negative_offset)
 bpf_error:
 # force a return 0 from jit handler
 	xor	%eax,%eax
-	mov	- MAX_BPF_STACK(%rbp),%rbx
-	mov	- MAX_BPF_STACK + 8(%rbp),%r13
-	mov	- MAX_BPF_STACK + 16(%rbp),%r14
-	mov	- MAX_BPF_STACK + 24(%rbp),%r15
+	mov	(%rbp),%rbx
+	mov	8(%rbp),%r13
+	mov	16(%rbp),%r14
+	mov	24(%rbp),%r15
+	add	$40, %rbp
 	leaveq
 	ret

arch/x86/net/bpf_jit_comp.c

Lines changed: 35 additions & 30 deletions
@@ -197,31 +197,34 @@ struct jit_context {
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY	64

-#define STACKSIZE \
-	(MAX_BPF_STACK + \
-	 32 /* space for rbx, r13, r14, r15 */ + \
+#define AUX_STACK_SPACE \
+	(32 /* space for rbx, r13, r14, r15 */ + \
 	 8 /* space for skb_copy_bits() buffer */)

-#define PROLOGUE_SIZE 48
+#define PROLOGUE_SIZE 37

 /* emit x64 prologue code for BPF program and check it's size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
-static void emit_prologue(u8 **pprog)
+static void emit_prologue(u8 **pprog, u32 stack_depth)
 {
 	u8 *prog = *pprog;
 	int cnt = 0;

 	EMIT1(0x55); /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */

-	/* sub rsp, STACKSIZE */
-	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
+	/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
+	EMIT3_off32(0x48, 0x81, 0xEC,
+		    round_up(stack_depth, 8) + AUX_STACK_SPACE);
+
+	/* sub rbp, AUX_STACK_SPACE */
+	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);

 	/* all classic BPF filters use R6(rbx) save it */

-	/* mov qword ptr [rbp-X],rbx */
-	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
+	/* mov qword ptr [rbp+0],rbx */
+	EMIT4(0x48, 0x89, 0x5D, 0);

 	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
 	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -231,12 +234,12 @@ static void emit_prologue(u8 **pprog)
 	 * than synthetic ones. Therefore not worth adding complexity.
 	 */

-	/* mov qword ptr [rbp-X],r13 */
-	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
-	/* mov qword ptr [rbp-X],r14 */
-	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
-	/* mov qword ptr [rbp-X],r15 */
-	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
+	/* mov qword ptr [rbp+8],r13 */
+	EMIT4(0x4C, 0x89, 0x6D, 8);
+	/* mov qword ptr [rbp+16],r14 */
+	EMIT4(0x4C, 0x89, 0x75, 16);
+	/* mov qword ptr [rbp+24],r15 */
+	EMIT4(0x4C, 0x89, 0x7D, 24);

 	/* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
 	 * we need to reset the counter to 0. It's done in two instructions,
@@ -246,8 +249,8 @@ static void emit_prologue(u8 **pprog)

 	/* xor eax, eax */
 	EMIT2(0x31, 0xc0);
-	/* mov qword ptr [rbp-X], rax */
-	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+	/* mov qword ptr [rbp+32], rax */
+	EMIT4(0x48, 0x89, 0x45, 32);

 	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
 	*pprog = prog;
@@ -289,13 +292,13 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
 	 *	goto out;
 	 */
-	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+	EMIT2_off32(0x8B, 0x85, 36);              /* mov eax, dword ptr [rbp + 36] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
 #define OFFSET2 36
 	EMIT2(X86_JA, OFFSET2);                   /* ja out */
 	label2 = cnt;
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
-	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+	EMIT2_off32(0x89, 0x85, 36);              /* mov dword ptr [rbp + 36], eax */

 	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
@@ -361,7 +364,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	int proglen = 0;
 	u8 *prog = temp;

-	emit_prologue(&prog);
+	emit_prologue(&prog, bpf_prog->aux->stack_depth);

 	if (seen_ld_abs)
 		emit_load_skb_data_hlen(&prog);
@@ -877,7 +880,7 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;

-		case BPF_JMP | BPF_CALL | BPF_X:
+		case BPF_JMP | BPF_TAIL_CALL:
 			emit_bpf_tail_call(&prog);
 			break;

@@ -1036,15 +1039,17 @@ xadd:			if (is_imm8(insn->off))
 			seen_exit = true;
 			/* update cleanup_addr */
 			ctx->cleanup_addr = proglen;
-			/* mov rbx, qword ptr [rbp-X] */
-			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
-			/* mov r13, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
-			/* mov r14, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
-			/* mov r15, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
-
+			/* mov rbx, qword ptr [rbp+0] */
+			EMIT4(0x48, 0x8B, 0x5D, 0);
+			/* mov r13, qword ptr [rbp+8] */
+			EMIT4(0x4C, 0x8B, 0x6D, 8);
+			/* mov r14, qword ptr [rbp+16] */
+			EMIT4(0x4C, 0x8B, 0x75, 16);
+			/* mov r15, qword ptr [rbp+24] */
+			EMIT4(0x4C, 0x8B, 0x7D, 24);
+
+			/* add rbp, AUX_STACK_SPACE */
+			EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE);
 			EMIT1(0xC9); /* leave */
 			EMIT1(0xC3); /* ret */
 			break;
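
For orientation, a sketch of the frame the new prologue builds (an editorial
aid derived from the offsets emitted above, not text from the commit):

        /*
         * push rbp; mov rbp, rsp
         * sub rsp, round_up(stack_depth, 8) + AUX_STACK_SPACE
         * sub rbp, AUX_STACK_SPACE
         *
         *   rbp+32..rbp+39        : skb_copy_bits() scratch; tail_call_cnt dword at rbp+36
         *   rbp+0 ..rbp+24        : saved rbx, r13, r14, r15
         *   rbp-stack_depth..rbp-1: BPF program stack (R10 maps to rbp)
         */

Because the callee-saved slots now sit at small positive offsets from the
adjusted rbp, the spills fit the short EMIT4() forms with imm8 displacements
instead of EMIT3_off32() with 32-bit displacements, which is why PROLOGUE_SIZE
shrinks from 48 to 37, and only the stack the program actually uses is
reserved below rbp.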

include/linux/bpf.h

Lines changed: 1 addition & 0 deletions
@@ -171,6 +171,7 @@ struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
+	u32 stack_depth;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_verifier_ops *ops;

include/linux/filter.h

Lines changed: 3 additions & 0 deletions
@@ -57,6 +57,9 @@ struct bpf_prog_aux;
 #define BPF_REG_AX		MAX_BPF_REG
 #define MAX_BPF_JIT_REG	(MAX_BPF_REG + 1)

+/* unused opcode to mark special call to bpf_tail_call() helper */
+#define BPF_TAIL_CALL	0xf0
+
 /* As per nm, we expose JITed images as text (code) section for
  * kallsyms. That way, tools like perf can find it to match
  * addresses.
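
A standalone sanity check of the opcode arithmetic (an editorial sketch;
BPF_JMP, BPF_CALL and BPF_X are the uapi values from bpf_common.h and bpf.h,
BPF_TAIL_CALL is the internal marker added above):

        #include <stdio.h>

        #define BPF_JMP         0x05
        #define BPF_CALL        0x80
        #define BPF_X           0x08
        #define BPF_TAIL_CALL   0xf0    /* internal marker from this series */

        int main(void)
        {
                /* old marker was assembled from uapi opcode bits */
                printf("old: 0x%02x\n", BPF_JMP | BPF_CALL | BPF_X); /* 0x8d */
                /* new marker lands outside the uapi-defined opcode space */
                printf("new: 0x%02x\n", BPF_JMP | BPF_TAIL_CALL);    /* 0xf5 */
                return 0;
        }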

kernel/bpf/core.c

Lines changed: 38 additions & 9 deletions
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
  *
  * Decode and execute eBPF instructions.
  */
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+				    u64 *stack)
 {
-	u64 stack[MAX_BPF_STACK / sizeof(u64)];
-	u64 regs[MAX_BPF_REG], tmp;
+	u64 tmp;
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
-		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })

-	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
-
 select_insn:
 	goto *jumptable[insn->code];

@@ -1219,7 +1216,39 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
 		return 0;
 }
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+	u64 stack[stack_size / sizeof(u64)]; \
+	u64 regs[MAX_BPF_REG]; \
+\
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+	ARG1 = (u64) (unsigned long) ctx; \
+	return ___bpf_prog_run(regs, insn, stack); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+				      const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};

 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
@@ -1268,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-	fp->bpf_func = (void *) __bpf_prog_run;
+	fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];

 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
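
To make the macro machinery above concrete, here is a hand-expansion of one
generated interpreter (an editorial sketch, equivalent to what
DEFINE_BPF_PROG_RUN(32) produces in the hunk above):

        static unsigned int __bpf_prog_run32(const void *ctx,
                                             const struct bpf_insn *insn)
        {
                u64 stack[32 / sizeof(u64)];  /* 4 slots, vs 64 for MAX_BPF_STACK */
                u64 regs[MAX_BPF_REG];

                FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];  /* R10 */
                ARG1 = (u64) (unsigned long) ctx;                      /* R1 */
                return ___bpf_prog_run(regs, insn, stack);
        }

The sixteen variants differ only in the size of the on-stack scratch array, so
a program the verifier proved to use little stack no longer pays for a full
512-byte frame on every invocation.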

kernel/bpf/verifier.c

Lines changed: 11 additions & 2 deletions
@@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
 		}
+
+		if (env->prog->aux->stack_depth < -off)
+			env->prog->aux->stack_depth = -off;
+
 		if (t == BPF_WRITE) {
 			if (!env->allow_ptr_leaks &&
 			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		return -EACCES;
 	}

+	if (env->prog->aux->stack_depth < -off)
+		env->prog->aux->stack_depth = -off;
+
 	if (meta && meta->raw_mode) {
 		meta->access_size = access_size;
 		meta->regno = regno;
@@ -3167,7 +3174,8 @@ static int do_check(struct bpf_verifier_env *env)
 		insn_idx++;
 	}

-	verbose("processed %d insns\n", insn_processed);
+	verbose("processed %d insns, stack depth %d\n",
+		insn_processed, env->prog->aux->stack_depth);
 	return 0;
 }
@@ -3462,14 +3470,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * the program array.
 			 */
 			prog->cb_access = 1;
+			env->prog->aux->stack_depth = MAX_BPF_STACK;

 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpeter for every normal
 			 * call and to prevent accidental JITing by JIT compiler
 			 * that doesn't support bpf_tail_call yet
 			 */
 			insn->imm = 0;
-			insn->code |= BPF_X;
+			insn->code = BPF_JMP | BPF_TAIL_CALL;
 			continue;
 		}
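
As an editorial illustration of what the two new hooks accumulate (the
instructions and numbers below are example values, not from the commit):

        /* *(u64 *)(r10 - 8)  = 1;  -> check_mem_access() sees off = -8
         * *(u64 *)(r10 - 40) = 2;  -> check_mem_access() sees off = -40
         *
         * stack_depth only ever grows, so this program ends verification
         * with aux->stack_depth == 40, the deepest byte it can touch.
         * A program containing bpf_tail_call() is pinned to MAX_BPF_STACK
         * instead, because the callee's stack usage is unknown here.
         */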
