Commit 557c0c6

4ast authored and davem330 committed
bpf: convert stackmap to pre-allocation
It was observed that calling bpf_get_stackid() from a kprobe inside
slub or from spin_unlock causes a deadlock similar to the one seen with
hashmap, therefore convert stackmap to use pre-allocated memory.

call_rcu is no longer a feasible mechanism, since delayed freeing causes
bpf_get_stackid() to fail unpredictably when the number of actual stacks
is significantly less than the user-requested max_entries. Since elements
are no longer freed into slub, we can push elements onto the freelist
immediately and let them be recycled. However, a very unlikely race
between user space map_lookup() and program-side recycling is possible:

     cpu0                          cpu1
     ----                          ----
 user does lookup(stackidX)
 starts copying ips into buffer
                                   delete(stackidX)
                                   calls bpf_get_stackid()
                                   which recycles the element and
                                   overwrites it with a new stack trace

To avoid user space seeing a partial stack trace consisting of two merged
stack traces, do bucket = xchg(, NULL); copy; xchg(, bucket); to preserve
consistent stack trace delivery to user space. Now we can move the
memset(, 0) of the left-over element value from the critical path of
bpf_get_stackid() into the slow path of the user space lookup. Also
disallow lookup() from bpf programs, since it is useless and a program
should not be messing with the collected stack trace.

Note that a similar race between user space lookup and kernel-side
updates is also present in hashmap, but it is not a new race: bpf
programs were always allowed to modify hash and array map elements while
user space is copying them.

Fixes: d5a3b1f ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Alexei Starovoitov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 823707b commit 557c0c6
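Editorial note, not part of this commit: a minimal sketch of the kind of kprobe
program that exercises bpf_get_stackid() from inside the allocator. It follows
the samples/bpf conventions of the time; the map and program names are
illustrative only.

/* illustrative only -- not part of this commit */
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/perf_event.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") stack_traces = {
	.type = BPF_MAP_TYPE_STACK_TRACE,
	.key_size = sizeof(__u32),
	.value_size = PERF_MAX_STACK_DEPTH * sizeof(__u64),
	.max_entries = 1024,
};

SEC("kprobe/kmem_cache_free")
int trace_free(struct pt_regs *ctx)
{
	/* With pre-allocated buckets this helper no longer allocates from
	 * slub, so calling it from a probe inside the allocator is safe. */
	int id = bpf_get_stackid(ctx, &stack_traces, 0);

	if (id < 0)
		return 0;
	/* id would normally be stored in another map, keyed e.g. by pid */
	return 0;
}

char _license[] SEC("license") = "GPL";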

3 files changed: +71, -18 lines

include/linux/bpf.h (1 addition, 0 deletions)

@@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 			   u64 flags);
 int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 			    u64 flags);
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.

kernel/bpf/stackmap.c (68 additions, 18 deletions)
@@ -10,20 +10,45 @@
 #include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include "percpu_freelist.h"
 
 struct stack_map_bucket {
-	struct rcu_head rcu;
+	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
 	u64 ip[];
 };
 
 struct bpf_stack_map {
 	struct bpf_map map;
+	void *elems;
+	struct pcpu_freelist freelist;
 	u32 n_buckets;
-	struct stack_map_bucket __rcu *buckets[];
+	struct stack_map_bucket *buckets[];
 };
 
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+	int err;
+
+	smap->elems = vzalloc(elem_size * smap->map.max_entries);
+	if (!smap->elems)
+		return -ENOMEM;
+
+	err = pcpu_freelist_init(&smap->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+			       smap->map.max_entries);
+	return 0;
+
+free_elems:
+	vfree(smap->elems);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
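Editorial aside, not part of the diff: the hunk above replaces on-demand
kmalloc with one flat vzalloc'd region threaded onto a per-cpu freelist up
front. A rough user-space analogue of that pattern (single-threaded, plain
pointers instead of pcpu_freelist, all names hypothetical) looks like this:

#include <stdint.h>
#include <stdlib.h>

struct node {
	struct node *next;	/* plays the role of pcpu_freelist_node */
	uint32_t hash;
	uint32_t nr;
	uint64_t ip[];
};

struct pool {
	void *elems;		/* one flat allocation for all elements */
	struct node *free;	/* freelist threaded through elems */
};

static int pool_init(struct pool *p, size_t value_size, unsigned int max_entries)
{
	size_t elem_size = sizeof(struct node) + value_size;
	unsigned int i;

	p->elems = calloc(max_entries, elem_size);
	if (!p->elems)
		return -1;

	p->free = NULL;
	for (i = 0; i < max_entries; i++) {
		struct node *n = (struct node *)((char *)p->elems + i * elem_size);

		n->next = p->free;	/* push every element up front */
		p->free = n;
	}
	return 0;
}

Popping and pushing nodes then never touches the allocator, which is the
property the kernel-side conversion needs.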
@@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	err = bpf_map_precharge_memlock(smap->map.pages);
+	if (err)
+		goto free_smap;
+
 	err = get_callchain_buffers();
 	if (err)
 		goto free_smap;
 
+	err = prealloc_elems_and_freelist(smap);
+	if (err)
+		goto put_buffers;
+
 	return &smap->map;
 
+put_buffers:
+	put_callchain_buffers();
 free_smap:
 	kvfree(smap);
 	return ERR_PTR(err);
@@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	ips = trace->ip + skip + init_nr;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
-	bucket = rcu_dereference(smap->buckets[id]);
+	bucket = READ_ONCE(smap->buckets[id]);
 
 	if (bucket && bucket->hash == hash) {
 		if (flags & BPF_F_FAST_STACK_CMP)
@@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	if (bucket && !(flags & BPF_F_REUSE_STACKID))
 		return -EEXIST;
 
-	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
-			     GFP_ATOMIC | __GFP_NOWARN);
+	new_bucket = (struct stack_map_bucket *)
+		pcpu_freelist_pop(&smap->freelist);
 	if (unlikely(!new_bucket))
 		return -ENOMEM;
 
 	memcpy(new_bucket->ip, ips, trace_len);
-	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
 	old_bucket = xchg(&smap->buckets[id], new_bucket);
 	if (old_bucket)
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 	return id;
 }
 
@@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-/* Called from syscall or from eBPF program */
+/* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct stack_map_bucket *bucket;
-	u32 id = *(u32 *)key;
+	struct stack_map_bucket *bucket, *old_bucket;
+	u32 id = *(u32 *)key, trace_len;
 
 	if (unlikely(id >= smap->n_buckets))
-		return NULL;
-	bucket = rcu_dereference(smap->buckets[id]);
-	return bucket ? bucket->ip : NULL;
+		return -ENOENT;
+
+	bucket = xchg(&smap->buckets[id], NULL);
+	if (!bucket)
+		return -ENOENT;
+
+	trace_len = bucket->nr * sizeof(u64);
+	memcpy(value, bucket->ip, trace_len);
+	memset(value + trace_len, 0, map->value_size - trace_len);
+
+	old_bucket = xchg(&smap->buckets[id], bucket);
+	if (old_bucket)
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+	return 0;
 }
 
 static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
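Editorial aside, not part of the diff: bpf_stackmap_copy() above relies on the
xchg-out / copy / xchg-back pattern described in the commit message. A
self-contained C11 sketch of the same idea (names and the recycle callback are
hypothetical) is:

#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

struct bucket {
	unsigned int nr;
	unsigned long long ip[];
};

/* slot is the shared table entry; recycle() models pcpu_freelist_push() */
static int copy_bucket(_Atomic(struct bucket *) *slot, void *value,
		       size_t value_size, void (*recycle)(struct bucket *))
{
	struct bucket *b, *old;
	size_t len;

	b = atomic_exchange(slot, NULL);	/* hide the bucket from updaters */
	if (!b)
		return -1;

	len = b->nr * sizeof(b->ip[0]);
	memcpy(value, b->ip, len);
	memset((char *)value + len, 0, value_size - len);

	old = atomic_exchange(slot, b);		/* put it back */
	if (old)				/* an updater installed a new bucket meanwhile */
		recycle(old);
	return 0;
}

Because the bucket is invisible to updaters while it is being copied, user
space can never observe two merged stack traces in one value.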
@@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 
 	old_bucket = xchg(&smap->buckets[id], NULL);
 	if (old_bucket) {
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 static void stack_map_free(struct bpf_map *map)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	int i;
 
+	/* wait for bpf programs to complete before freeing stack map */
 	synchronize_rcu();
 
-	for (i = 0; i < smap->n_buckets; i++)
-		if (smap->buckets[i])
-			kfree_rcu(smap->buckets[i], rcu);
+	vfree(smap->elems);
+	pcpu_freelist_destroy(&smap->freelist);
 	kvfree(smap);
 	put_callchain_buffers();
 }

kernel/bpf/syscall.c (2 additions, 0 deletions)
@@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
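
Editorial aside, not part of the diff: with the dispatch above in place, user
space reads a collected trace through the regular BPF_MAP_LOOKUP_ELEM command.
A rough sketch of that consumer side, using the raw bpf(2) syscall (the helper
names here are illustrative):

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static uint64_t ptr_to_u64(const void *ptr)
{
	return (uint64_t)(unsigned long)ptr;
}

/* value must point to a buffer of the map's value_size bytes */
static int stackmap_lookup(int map_fd, uint32_t stack_id, uint64_t *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = ptr_to_u64(&stack_id);
	attr.value = ptr_to_u64(value);

	/* On success the buffer holds bucket->nr addresses followed by zeroes,
	 * since the kernel now zeroes the tail in this slow path. */
	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}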
