
Commit 615755a

liu-song-6 authored and borkmann committed
bpf: extend stackmap to save binary_build_id+offset instead of address
Currently, the bpf stackmap stores an address for each entry in the call
trace. To map these addresses to user space files, it is necessary to
maintain the mapping from these virtual addresses to symbols in the binary.
Usually, the user space profiler (such as perf) has to scan /proc/pid/maps
at the beginning of profiling, and monitor mmap2() calls afterwards. Given
the cost of maintaining the address map, this solution is not practical for
system-wide profiling that is always on.

This patch tries to solve this problem with a variation of stackmap,
enabled by the flag BPF_F_STACK_BUILD_ID. Instead of storing addresses,
this variation stores the ELF file build_id + offset. A build ID is a
20-byte unique identifier for an ELF file. The following command shows the
build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
    Build ID: XXXXXXXXXX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the build ID
for each entry in the call trace, and translate it into the following
struct:

  struct bpf_stack_build_id {
          __s32           status;
          unsigned char   build_id[BPF_BUILD_ID_SIZE];
          union {
                  __u64   offset;
                  __u64   ip;
          };
  };

The search for the build_id is limited to the first page of the file, and
this page should be in the page cache. Otherwise, we fall back to storing
the ip for this entry (the ip field in struct bpf_stack_build_id). This
requires the build_id to be stored in the first page. A quick survey of
binary and dynamic library files on a few different systems shows that
almost all binary and dynamic library files have the build_id in the first
page.

The build_id is only meaningful for user stacks. If a kernel stack is added
to a stackmap with BPF_F_STACK_BUILD_ID, it will automatically fall back to
only storing the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the
build_id lookup fails for some reason, it will also fall back to storing
the ip.

User space can access struct bpf_stack_build_id with the bpf syscall
BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain a mapping
from build id to binary files. This mostly static mapping is much easier to
maintain than per-process address maps.

Note: a stackmap with build_id only works in non-nmi context at this time.
This is because we need to take mm->mmap_sem for find_vma(). If this
changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
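A rough sketch of the user-space side described in the message, not taken
from this commit: it assumes map_fd refers to a BPF_F_STACK_BUILD_ID
stackmap populated by a program that called bpf_get_stackid(), and the
helper name dump_trace() is hypothetical.

  #include <linux/bpf.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Hypothetical helper: fetch one trace from a BPF_F_STACK_BUILD_ID
   * stackmap and print each frame. max_depth must match the depth the
   * map was created with, i.e.
   * value_size / sizeof(struct bpf_stack_build_id). */
  static int dump_trace(int map_fd, __u32 stack_id, unsigned int max_depth)
  {
          struct bpf_stack_build_id buf[max_depth];
          union bpf_attr attr;
          unsigned int i;

          memset(&attr, 0, sizeof(attr));
          attr.map_fd = map_fd;
          attr.key = (__u64)(unsigned long)&stack_id;
          attr.value = (__u64)(unsigned long)buf;

          if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0)
                  return -1;

          for (i = 0; i < max_depth; i++) {
                  if (buf[i].status == BPF_STACK_BUILD_ID_EMPTY)
                          break;  /* kernel zero-fills the unused tail */
                  if (buf[i].status == BPF_STACK_BUILD_ID_VALID)
                          printf("build_id %02x%02x... offset 0x%llx\n",
                                 buf[i].build_id[0], buf[i].build_id[1],
                                 (unsigned long long)buf[i].offset);
                  else    /* BPF_STACK_BUILD_ID_IP: fell back to raw ip */
                          printf("ip 0x%llx\n",
                                 (unsigned long long)buf[i].ip);
          }
          return 0;
  }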
1 parent 6d8cb04 commit 615755a

File tree

2 files changed: +257 -22 lines


include/uapi/linux/bpf.h

Lines changed: 22 additions & 0 deletions
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY            (1U << 3)
 #define BPF_F_WRONLY            (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID    (1U << 5)
+
+enum bpf_stack_build_id_status {
+        /* user space need an empty entry to identify end of a trace */
+        BPF_STACK_BUILD_ID_EMPTY = 0,
+        /* with valid build_id and offset */
+        BPF_STACK_BUILD_ID_VALID = 1,
+        /* couldn't get build_id, fallback to ip */
+        BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE       20
+struct bpf_stack_build_id {
+        __s32           status;
+        unsigned char   build_id[BPF_BUILD_ID_SIZE];
+        union {
+                __u64   offset;
+                __u64   ip;
+        };
+};
+
 union bpf_attr {
         struct { /* anonymous struct used by BPF_MAP_CREATE command */
                 __u32   map_type;       /* one of enum bpf_map_type */
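A consequence of the definitions above for user space (not part of this
commit): with BPF_F_STACK_BUILD_ID, each frame slot in a stackmap value is
a 32-byte struct bpf_stack_build_id rather than an 8-byte address, so
value_size must be sized accordingly. A minimal creation sketch, with the
depth of 127 and max_entries of 1024 as arbitrary example values:

  #include <linux/bpf.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Sketch: create a build_id-flavored stackmap. The depth must not
   * exceed sysctl_perf_event_max_stack. */
  static int create_build_id_stackmap(void)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.map_type = BPF_MAP_TYPE_STACK_TRACE;
          attr.key_size = sizeof(__u32);
          /* one 32-byte struct bpf_stack_build_id per frame, not a u64 */
          attr.value_size = 127 * sizeof(struct bpf_stack_build_id);
          attr.max_entries = 1024;
          attr.map_flags = BPF_F_STACK_BUILD_ID;

          return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
  }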

kernel/bpf/stackmap.c

Lines changed: 235 additions & 22 deletions
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+         BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
         struct pcpu_freelist_node fnode;
         u32 hash;
         u32 nr;
-        u64 ip[];
+        u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
         struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+        return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+        return stack_map_use_build_id(map) ?
+                sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
         u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
         /* check sanity of attributes */
         if (attr->max_entries == 0 || attr->key_size != 4 ||
-            value_size < 8 || value_size % 8 ||
-            value_size / 8 > sysctl_perf_event_max_stack)
+            value_size < 8 || value_size % 8)
+                return ERR_PTR(-EINVAL);
+
+        BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+        if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+                if (value_size % sizeof(struct bpf_stack_build_id) ||
+                    value_size / sizeof(struct bpf_stack_build_id)
+                    > sysctl_perf_event_max_stack)
+                        return ERR_PTR(-EINVAL);
+        } else if (value_size / 8 > sysctl_perf_event_max_stack)
                 return ERR_PTR(-EINVAL);
 
         /* hash table size must be power of 2 */
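In effect, these checks mean a build_id-flavored map's value_size must be a
multiple of sizeof(struct bpf_stack_build_id), i.e. 32 bytes, and may
describe at most sysctl_perf_event_max_stack frames; with the kernel's
default of 127 that caps value_size at 4064 bytes.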
@@ -114,20 +136,192 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
         return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+                                           unsigned char *build_id,
+                                           void *note_start,
+                                           Elf32_Word note_size)
+{
+        Elf32_Word note_offs = 0, new_offs;
+
+        /* check for overflow */
+        if (note_start < page_addr || note_start + note_size < note_start)
+                return -EINVAL;
+
+        /* only supports note that fits in the first page */
+        if (note_start + note_size > page_addr + PAGE_SIZE)
+                return -EINVAL;
+
+        while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+                Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+                if (nhdr->n_type == BPF_BUILD_ID &&
+                    nhdr->n_namesz == sizeof("GNU") &&
+                    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+                        memcpy(build_id,
+                               note_start + note_offs +
+                               ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+                               BPF_BUILD_ID_SIZE);
+                        return 0;
+                }
+                new_offs = note_offs + sizeof(Elf32_Nhdr) +
+                        ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+                if (new_offs <= note_offs)      /* overflow */
+                        break;
+                note_offs = new_offs;
+        }
+        return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+                                     unsigned char *build_id)
+{
+        Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+        Elf32_Phdr *phdr;
+        int i;
+
+        /* only supports phdr that fits in one page */
+        if (ehdr->e_phnum >
+            (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+                return -EINVAL;
+
+        phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+        for (i = 0; i < ehdr->e_phnum; ++i)
+                if (phdr[i].p_type == PT_NOTE)
+                        return stack_map_parse_build_id(page_addr, build_id,
+                                        page_addr + phdr[i].p_offset,
+                                        phdr[i].p_filesz);
+        return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+                                     unsigned char *build_id)
+{
+        Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+        Elf64_Phdr *phdr;
+        int i;
+
+        /* only supports phdr that fits in one page */
+        if (ehdr->e_phnum >
+            (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+                return -EINVAL;
+
+        phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+        for (i = 0; i < ehdr->e_phnum; ++i)
+                if (phdr[i].p_type == PT_NOTE)
+                        return stack_map_parse_build_id(page_addr, build_id,
+                                        page_addr + phdr[i].p_offset,
+                                        phdr[i].p_filesz);
+        return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+                                  unsigned char *build_id)
+{
+        Elf32_Ehdr *ehdr;
+        struct page *page;
+        void *page_addr;
+        int ret;
+
+        /* only works for page backed storage */
+        if (!vma->vm_file)
+                return -EINVAL;
+
+        page = find_get_page(vma->vm_file->f_mapping, 0);
+        if (!page)
+                return -EFAULT; /* page not mapped */
+
+        ret = -EINVAL;
+        page_addr = page_address(page);
+        ehdr = (Elf32_Ehdr *)page_addr;
+
+        /* compare magic x7f "ELF" */
+        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+                goto out;
+
+        /* only support executable file and shared object file */
+        if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+                goto out;
+
+        if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+                ret = stack_map_get_build_id_32(page_addr, build_id);
+        else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+                ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+        put_page(page);
+        return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+                                          struct stack_map_bucket *bucket,
+                                          u64 *ips, u32 trace_nr, bool user)
+{
+        int i;
+        struct vm_area_struct *vma;
+        struct bpf_stack_build_id *id_offs;
+
+        bucket->nr = trace_nr;
+        id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+        /*
+         * We cannot do up_read() in nmi context, so build_id lookup is
+         * only supported for non-nmi events. If at some point, it is
+         * possible to run find_vma() without taking the semaphore, we
+         * would like to allow build_id lookup in nmi context.
+         *
+         * Same fallback is used for kernel stack (!user) on a stackmap
+         * with build_id.
+         */
+        if (!user || !current || !current->mm || in_nmi() ||
+            down_read_trylock(&current->mm->mmap_sem) == 0) {
+                /* cannot access current->mm, fall back to ips */
+                for (i = 0; i < trace_nr; i++) {
+                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                        id_offs[i].ip = ips[i];
+                }
+                return;
+        }
+
+        for (i = 0; i < trace_nr; i++) {
+                vma = find_vma(current->mm, ips[i]);
+                if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+                        /* per entry fall back to ips */
+                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                        id_offs[i].ip = ips[i];
+                        continue;
+                }
+                id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+                        - vma->vm_start;
+                id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+        }
+        up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
            u64, flags)
 {
         struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
         struct perf_callchain_entry *trace;
         struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-        u32 max_depth = map->value_size / 8;
+        u32 max_depth = map->value_size / stack_map_data_size(map);
         /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
         u32 init_nr = sysctl_perf_event_max_stack - max_depth;
         u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
         u32 hash, id, trace_nr, trace_len;
         bool user = flags & BPF_F_USER_STACK;
         bool kernel = !user;
         u64 *ips;
+        bool hash_matches;
 
         if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                                BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
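As stack_map_get_build_id_offset() above shows, the stored offset is a file
offset within the mapped object: (vma->vm_pgoff << PAGE_SHIFT) + ip -
vma->vm_start. A hedged user-space sketch of what symbolization could then
look like; build_id_to_path() is a hypothetical lookup (for example against
a /usr/lib/debug/.build-id/-style store), and a real symbolizer would first
translate the file offset back to a virtual address using the binary's
program headers, which this sketch glosses over.

  #include <stdio.h>

  #define BPF_BUILD_ID_SIZE 20

  /* Hypothetical lookup from a build ID to a local binary path. */
  extern const char *build_id_to_path(const unsigned char *build_id);

  /* Print a plausible addr2line invocation for one VALID entry. */
  static void print_symbolize_cmd(const unsigned char *build_id,
                                  unsigned long long offset)
  {
          char hex[2 * BPF_BUILD_ID_SIZE + 1];
          const char *path = build_id_to_path(build_id);
          int i;

          for (i = 0; i < BPF_BUILD_ID_SIZE; i++)
                  snprintf(hex + 2 * i, 3, "%02x", build_id[i]);

          printf("addr2line -e %s 0x%llx   # build_id %s\n",
                 path ? path : "<unknown>", offset, hex);
  }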
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
         id = hash & (smap->n_buckets - 1);
         bucket = READ_ONCE(smap->buckets[id]);
 
-        if (bucket && bucket->hash == hash) {
-                if (flags & BPF_F_FAST_STACK_CMP)
+        hash_matches = bucket && bucket->hash == hash;
+        /* fast cmp */
+        if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+                return id;
+
+        if (stack_map_use_build_id(map)) {
+                /* for build_id+offset, pop a bucket before slow cmp */
+                new_bucket = (struct stack_map_bucket *)
+                        pcpu_freelist_pop(&smap->freelist);
+                if (unlikely(!new_bucket))
+                        return -ENOMEM;
+                stack_map_get_build_id_offset(map, new_bucket, ips,
+                                              trace_nr, user);
+                trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+                if (hash_matches && bucket->nr == trace_nr &&
+                    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                         return id;
-                if (bucket->nr == trace_nr &&
-                    memcmp(bucket->ip, ips, trace_len) == 0)
+                }
+                if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+                        return -EEXIST;
+                }
+        } else {
+                if (hash_matches && bucket->nr == trace_nr &&
+                    memcmp(bucket->data, ips, trace_len) == 0)
                         return id;
+                if (bucket && !(flags & BPF_F_REUSE_STACKID))
+                        return -EEXIST;
+
+                new_bucket = (struct stack_map_bucket *)
+                        pcpu_freelist_pop(&smap->freelist);
+                if (unlikely(!new_bucket))
+                        return -ENOMEM;
+                memcpy(new_bucket->data, ips, trace_len);
         }
 
-        /* this call stack is not in the map, try to add it */
-        if (bucket && !(flags & BPF_F_REUSE_STACKID))
-                return -EEXIST;
-
-        new_bucket = (struct stack_map_bucket *)
-                pcpu_freelist_pop(&smap->freelist);
-        if (unlikely(!new_bucket))
-                return -ENOMEM;
-
-        memcpy(new_bucket->ip, ips, trace_len);
         new_bucket->hash = hash;
         new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
         if (!bucket)
                 return -ENOENT;
 
-        trace_len = bucket->nr * sizeof(u64);
-        memcpy(value, bucket->ip, trace_len);
+        trace_len = bucket->nr * stack_map_data_size(map);
+        memcpy(value, bucket->data, trace_len);
         memset(value + trace_len, 0, map->value_size - trace_len);
 
         old_bucket = xchg(&smap->buckets[id], bucket);
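One consequence of the memset() above: bpf_stackmap_copy() zero-fills the
tail of the value buffer, so for a build_id-flavored map a reader can rely
on status == BPF_STACK_BUILD_ID_EMPTY (0) as the end-of-trace marker, as
the enum comment in the uapi header anticipates and as the lookup sketch
near the top of this page does.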
