
Commit 8c7dcb8

Delyan Kratunov authored and Alexei Starovoitov committed
bpf: implement sleepable uprobes by chaining gps
uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe
programs is therefore on the bpf side.

Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu. In order for uprobe bpf programs to become sleepable, it has to be
protected by the tasks_trace rcu flavor instead (and kfree() called after
a corresponding grace period).

Therefore, the free path for bpf_prog_array now chains a tasks_trace and
normal grace periods one after the other.

Users who iterate under tasks_trace read section would be safe, as would
users who iterate under normal read sections (from non-sleepable
locations).

The downside is that the tasks_trace latency affects all
perf_event-attached bpf programs (and not just uprobe ones). This is
deemed safe given the possible attach rates for kprobe/uprobe/tp
programs.

Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so bpf_prog_run_array_sleepable now conditionally
takes an rcu read section, in addition to the overarching tasks_trace
section.

Signed-off-by: Delyan Kratunov <[email protected]>
Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com
Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent d687f62 commit 8c7dcb8
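[Editor's note] For intuition, the ordering guarantee of the chained free path is equivalent to synchronously waiting out both grace periods before freeing. A minimal sketch of that equivalence, using a hypothetical object type with an embedded rcu_head (the patch itself uses the asynchronous call_rcu_tasks_trace() + kfree_rcu() chain shown in kernel/bpf/core.c below):

	/* Hypothetical synchronous analogue of the chained free path. */
	#include <linux/rcupdate.h>
	#include <linux/rcupdate_trace.h>
	#include <linux/slab.h>

	struct my_obj {			/* illustrative type, not from the patch */
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void free_after_both_flavors(struct my_obj *obj)
	{
		/* First wait for all tasks_trace readers (sleepable
		 * traversals), then for all normal RCU readers
		 * (non-sleepable traversals), and only then free.
		 * The patch does the same thing asynchronously.
		 */
		synchronize_rcu_tasks_trace();
		synchronize_rcu();
		kfree(obj);
	}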

4 files changed: 71 additions, 5 deletions

include/linux/bpf.h

Lines changed: 52 additions & 0 deletions
@@ -26,6 +26,7 @@
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1372,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
+/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
 bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@@ -1463,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
 	return ret;
 }
 
+/* Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
+ * overall. As a result, we must use the bpf_prog_array_free_sleepable
+ * in order to use the tasks_trace rcu grace period.
+ *
+ * When a non-sleepable program is inside the array, we take the rcu read
+ * section and disable preemption for that program alone, so it can access
+ * rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	might_fault();
+
+	rcu_read_lock_trace();
+	migrate_disable();
+
+	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+	if (unlikely(!array))
+		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable)
+			rcu_read_lock();
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+		ret &= run_prog(prog, ctx);
+		item++;
+
+		if (!prog->aux->sleepable)
+			rcu_read_unlock();
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+out:
+	migrate_enable();
+	rcu_read_unlock_trace();
+	return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;

kernel/bpf/core.c

Lines changed: 15 additions & 0 deletions
@@ -2279,6 +2279,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
 	kfree_rcu(progs, rcu);
 }
 
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+	struct bpf_prog_array *progs;
+
+	progs = container_of(rcu, struct bpf_prog_array, rcu);
+	kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+	if (!progs || progs == &bpf_empty_prog_array.hdr)
+		return;
+	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
 int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
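[Editor's note] This chained free is what makes both traversal styles from the commit message safe. A hypothetical sketch of the two reader patterns the free path must outlive (function names invented for illustration; the real sleepable traversal additionally disables migration, see bpf_prog_run_array_sleepable above):

	/* Hypothetical readers of a tasks_trace-protected prog array. */
	static void sleepable_traversal(struct bpf_prog_array __rcu **arrp)
	{
		struct bpf_prog_array *arr;

		rcu_read_lock_trace();	/* readers here may sleep/fault */
		arr = rcu_dereference_check(*arrp, rcu_read_lock_trace_held());
		if (arr) {
			/* ... iterate items, possibly faulting ... */
		}
		rcu_read_unlock_trace();	/* call_rcu_tasks_trace() waits for this */
	}

	static void nonsleepable_traversal(struct bpf_prog_array __rcu **arrp)
	{
		struct bpf_prog_array *arr;

		rcu_read_lock();	/* readers here must not sleep */
		arr = rcu_dereference(*arrp);
		if (arr) {
			/* ... iterate items ... */
		}
		rcu_read_unlock();	/* the chained kfree_rcu() waits for this */
	}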

kernel/trace/bpf_trace.c

Lines changed: 2 additions & 2 deletions
@@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	event->prog = prog;
 	event->bpf_cookie = bpf_cookie;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
-	bpf_prog_array_free(old_array);
+	bpf_prog_array_free_sleepable(old_array);
 
 unlock:
 	mutex_unlock(&bpf_event_mutex);
@@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
 		rcu_assign_pointer(event->tp_event->prog_array, new_array);
-		bpf_prog_array_free(old_array);
+		bpf_prog_array_free_sleepable(old_array);
 	}
 
 	bpf_prog_put(event->prog);

kernel/trace/trace_uprobe.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/namei.h>
1717
#include <linux/string.h>
1818
#include <linux/rculist.h>
19+
#include <linux/filter.h>
1920

2021
#include "trace_dynevent.h"
2122
#include "trace_probe.h"
@@ -1346,9 +1347,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
13461347
if (bpf_prog_array_valid(call)) {
13471348
u32 ret;
13481349

1349-
preempt_disable();
1350-
ret = trace_call_bpf(call, regs);
1351-
preempt_enable();
1350+
ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
13521351
if (!ret)
13531352
return;
13541353
}
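
[Editor's note] For context on what this enables, here is a hypothetical userspace counterpart: a sleepable uprobe BPF program that reads a string argument from user memory, which a non-sleepable program cannot do because bpf_copy_from_user() may fault. The section name, target binary/function, and buffer size are illustrative, and it assumes a libbpf new enough to understand the sleepable "uprobe.s" section and binary:function auto-attach syntax:

	// SPDX-License-Identifier: GPL-2.0
	/* Hypothetical sleepable uprobe program (not part of this commit). */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>	/* PT_REGS_PARM1; build with -D__TARGET_ARCH_xxx */

	SEC("uprobe.s//usr/bin/example:some_func")	/* made-up target */
	int handle_some_func(struct pt_regs *ctx)
	{
		const char *user_str = (const char *)PT_REGS_PARM1(ctx);
		char buf[64];

		/* May fault and therefore sleep: legal only because the
		 * uprobe path now runs the prog_array under tasks_trace RCU.
		 */
		if (bpf_copy_from_user(buf, sizeof(buf), user_str) == 0)
			bpf_printk("some_func arg: %s", buf);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";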
