Skip to content

Commit ec15103

Browse files
jkoong-fb authored and Alexei Starovoitov committed
selftest/bpf/benchs: Add bpf_loop benchmark
Add benchmark to measure the throughput and latency of the bpf_loop call.

Testing this on my dev machine on 1 thread, the data is as follows:

nr_loops: 10 bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op
nr_loops: 100 bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op
nr_loops: 500 bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op
nr_loops: 1000 bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op
nr_loops: 5000 bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op
nr_loops: 10000 bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op
nr_loops: 50000 bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op
nr_loops: 100000 bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op
nr_loops: 500000 bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op
nr_loops: 1000000 bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op

From this data, we can see that the latency per loop decreases as the number of loops increases. On this particular machine, each loop had an overhead of about 4 ns, and we were able to run ~250 million loops per second.

Signed-off-by: Joanne Koong <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent f6e659b commit ec15103

File tree

7 files changed

+203
-1
lines changed

7 files changed

+203
-1
lines changed

tools/testing/selftests/bpf/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
531531
$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
532532
$(OUTPUT)/perfbuf_bench.skel.h
533533
$(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
534+
$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
534535
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
535536
$(OUTPUT)/bench: LDLIBS += -lm
536537
$(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -540,7 +541,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
540541
$(OUTPUT)/bench_rename.o \
541542
$(OUTPUT)/bench_trigger.o \
542543
$(OUTPUT)/bench_ringbufs.o \
543-
$(OUTPUT)/bench_bloom_filter_map.o
544+
$(OUTPUT)/bench_bloom_filter_map.o \
545+
$(OUTPUT)/bench_bpf_loop.o
544546
$(call msg,BINARY,,$@)
545547
$(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
546548

tools/testing/selftests/bpf/bench.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,39 @@ void hits_drops_report_final(struct bench_res res[], int res_cnt)
134134
total_ops_mean, total_ops_stddev);
135135
}
136136

137+
void ops_report_progress(int iter, struct bench_res *res, long delta_ns)
138+
{
139+
double hits_per_sec, hits_per_prod;
140+
141+
hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0);
142+
hits_per_prod = hits_per_sec / env.producer_cnt;
143+
144+
printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0);
145+
146+
printf("hits %8.3lfM/s (%7.3lfM/prod)\n", hits_per_sec, hits_per_prod);
147+
}
148+
149+
void ops_report_final(struct bench_res res[], int res_cnt)
150+
{
151+
double hits_mean = 0.0, hits_stddev = 0.0;
152+
int i;
153+
154+
for (i = 0; i < res_cnt; i++)
155+
hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt);
156+
157+
if (res_cnt > 1) {
158+
for (i = 0; i < res_cnt; i++)
159+
hits_stddev += (hits_mean - res[i].hits / 1000000.0) *
160+
(hits_mean - res[i].hits / 1000000.0) /
161+
(res_cnt - 1.0);
162+
163+
hits_stddev = sqrt(hits_stddev);
164+
}
165+
printf("Summary: throughput %8.3lf \u00B1 %5.3lf M ops/s (%7.3lfM ops/prod), ",
166+
hits_mean, hits_stddev, hits_mean / env.producer_cnt);
167+
printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt);
168+
}
169+
137170
const char *argp_program_version = "benchmark";
138171
const char *argp_program_bug_address = "<[email protected]>";
139172
const char argp_program_doc[] =
@@ -171,10 +204,12 @@ static const struct argp_option opts[] = {
171204

172205
extern struct argp bench_ringbufs_argp;
173206
extern struct argp bench_bloom_map_argp;
207+
extern struct argp bench_bpf_loop_argp;
174208

175209
static const struct argp_child bench_parsers[] = {
176210
{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
177211
{ &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 },
212+
{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
178213
{},
179214
};
180215

@@ -373,6 +408,7 @@ extern const struct bench bench_bloom_update;
373408
extern const struct bench bench_bloom_false_positive;
374409
extern const struct bench bench_hashmap_without_bloom;
375410
extern const struct bench bench_hashmap_with_bloom;
411+
extern const struct bench bench_bpf_loop;
376412

377413
static const struct bench *benchs[] = {
378414
&bench_count_global,
@@ -404,6 +440,7 @@ static const struct bench *benchs[] = {
404440
&bench_bloom_false_positive,
405441
&bench_hashmap_without_bloom,
406442
&bench_hashmap_with_bloom,
443+
&bench_bpf_loop,
407444
};
408445

409446
static void setup_benchmark()

tools/testing/selftests/bpf/bench.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
5959
void hits_drops_report_final(struct bench_res res[], int res_cnt);
6060
void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
6161
void false_hits_report_final(struct bench_res res[], int res_cnt);
62+
void ops_report_progress(int iter, struct bench_res *res, long delta_ns);
63+
void ops_report_final(struct bench_res res[], int res_cnt);
6264

6365
static inline __u64 get_time_ns() {
6466
struct timespec t;
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/* Copyright (c) 2021 Facebook */
3+
4+
#include <argp.h>
5+
#include "bench.h"
6+
#include "bpf_loop_bench.skel.h"
7+
8+
/* BPF triggering benchmarks */
9+
static struct ctx {
10+
struct bpf_loop_bench *skel;
11+
} ctx;
12+
13+
static struct {
14+
__u32 nr_loops;
15+
} args = {
16+
.nr_loops = 10,
17+
};
18+
19+
/* argp key identifying the --nr_loops long option */
enum {
	ARG_NR_LOOPS = 4000,
};

/* Options this benchmark contributes to the runner's command line */
static const struct argp_option opts[] = {
	{
		.name = "nr_loops",
		.key = ARG_NR_LOOPS,
		.arg = "nr_loops",
		.flags = 0,
		.doc = "Set number of loops for the bpf_loop helper",
	},
	{},
};
28+
29+
static error_t parse_arg(int key, char *arg, struct argp_state *state)
30+
{
31+
switch (key) {
32+
case ARG_NR_LOOPS:
33+
args.nr_loops = strtol(arg, NULL, 10);
34+
break;
35+
default:
36+
return ARGP_ERR_UNKNOWN;
37+
}
38+
39+
return 0;
40+
}
41+
42+
/* exported into benchmark runner */
43+
const struct argp bench_bpf_loop_argp = {
44+
.options = opts,
45+
.parser = parse_arg,
46+
};
47+
48+
static void validate(void)
49+
{
50+
if (env.consumer_cnt != 1) {
51+
fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
52+
exit(1);
53+
}
54+
}
55+
56+
/*
 * Producer thread body: issue getpgid syscalls in an endless loop. Each
 * syscall triggers the BPF program.
 *
 * @input: unused
 */
static void *producer(void *input)
{
	for (;;) {
		/* trigger the bpf program */
		syscall(__NR_getpgid);
	}

	return NULL;
}
64+
65+
/*
 * Consumer thread body: this benchmark has nothing to consume, so the
 * thread returns immediately.
 *
 * @input: unused
 */
static void *consumer(void *input)
{
	(void)input;

	return NULL;
}
69+
70+
static void measure(struct bench_res *res)
71+
{
72+
res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
73+
}
74+
75+
static void setup(void)
76+
{
77+
struct bpf_link *link;
78+
79+
setup_libbpf();
80+
81+
ctx.skel = bpf_loop_bench__open_and_load();
82+
if (!ctx.skel) {
83+
fprintf(stderr, "failed to open skeleton\n");
84+
exit(1);
85+
}
86+
87+
link = bpf_program__attach(ctx.skel->progs.benchmark);
88+
if (!link) {
89+
fprintf(stderr, "failed to attach program!\n");
90+
exit(1);
91+
}
92+
93+
ctx.skel->bss->nr_loops = args.nr_loops;
94+
}
95+
96+
const struct bench bench_bpf_loop = {
97+
.name = "bpf-loop",
98+
.validate = validate,
99+
.setup = setup,
100+
.producer_thread = producer,
101+
.consumer_thread = consumer,
102+
.measure = measure,
103+
.report_progress = ops_report_progress,
104+
.report_final = ops_report_final,
105+
};
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source ./benchs/run_common.sh

set -eufo pipefail

# Run the bpf-loop benchmark for every combination of producer thread count
# and bpf_loop iteration count, printing one summary line per combination.
for threads in 1 4 8 12 16; do
	for loops in 10 100 500 1000 5000 10000 50000 100000 500000 1000000; do
		subtitle "nr_loops: $loops, nr_threads: $threads"
		summarize_ops "bpf_loop: " \
			"$($RUN_BENCH -p $threads --nr_loops $loops bpf-loop)"
		printf "\n"
	done
done

tools/testing/selftests/bpf/benchs/run_common.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ function percentage()
3333
echo "$*" | sed -E "s/.*Percentage\s=\s+([0-9]+\.[0-9]+).*/\1/"
3434
}
3535

36+
# Extract the throughput and latency figures from a benchmark summary passed
# as arguments, and print them as "throughput: <value>, latency: <value>".
function ops()
{
	printf "throughput: "
	# no trailing newline on the input, so the latency label follows on the same line
	echo -n "$*" | sed -E 's/.*throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/'
	printf ", latency: "
	echo "$*" | sed -E 's/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/'
}
43+
3644
function total()
3745
{
3846
echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
@@ -52,6 +60,13 @@ function summarize_percentage()
5260
printf "%-20s %s%%\n" "$bench" "$(percentage $summary)"
5361
}
5462

63+
# Print one aligned result line: the benchmark label in $1 followed by the
# throughput/latency figures that ops() extracts from the output in $2.
function summarize_ops()
{
	bench="$1"
	# $2 is deliberately unquoted, as in the original
	summary=$(tail -n1 <<< "$(echo $2)")
	printf "%-20s %s\n" "$bench" "$(ops $summary)"
}
69+
5570
function summarize_total()
5671
{
5772
bench="$1"
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/* Copyright (c) 2021 Facebook */
3+
4+
#include "vmlinux.h"
5+
#include <bpf/bpf_helpers.h>
6+
7+
char _license[] SEC("license") = "GPL";
8+
9+
u32 nr_loops;
10+
long hits;
11+
12+
static int empty_callback(__u32 index, void *data)
13+
{
14+
return 0;
15+
}
16+
17+
SEC("fentry/__x64_sys_getpgid")
18+
int benchmark(void *ctx)
19+
{
20+
for (int i = 0; i < 1000; i++) {
21+
bpf_loop(nr_loops, empty_callback, NULL, 0);
22+
23+
__sync_add_and_fetch(&hits, nr_loops);
24+
}
25+
return 0;
26+
}

0 commit comments

Comments
 (0)