Commit f765274

Author: Alexei Starovoitov (committed)
Merge branch 'harden-and-extend-elf-build-id-parsing-logic'
Andrii Nakryiko says:

====================
Harden and extend ELF build ID parsing logic

The goal of this patch set is to extend the existing ELF build ID parsing logic, currently used mostly by the BPF subsystem, with support for working in sleepable mode, in which memory faults are allowed and can be relied upon to fetch the relevant parts of the ELF file to find and fetch .note.gnu.build-id information.

This is useful and important for the BPF subsystem itself, but also for the PROCMAP_QUERY ioctl(), built atop the /proc/<pid>/maps functionality (see [0]), which makes use of the same build_id_parse() functionality. PROCMAP_QUERY is always called from sleepable user process context, so it doesn't have to suffer from the current restrictions of build_id_parse(), which are due to its NMI-context assumption.

Along the way, we harden the logic to avoid TOCTOU, overflow, and out-of-bounds access problems. This is done in the very first patch, which can be backported to older releases if necessary.

We also lift the existing limitation of only working as long as the ELF program headers and the build ID note section are contained strictly within the very first page of the ELF file.

We achieve all of the above without duplicating logic between sleepable and non-sleepable modes through a freader abstraction that manages the underlying folio from the page cache (on demand) and gives a simple-to-use direct memory access interface. With that, lifting the single-page restriction and adding sleepable mode support is rather straightforward.

We also extend the existing set of BPF selftests with a few tests targeting build ID logic across sleepable and non-sleepable contexts (we utilize sleepable and non-sleepable uprobes for that).

  [0] https://lore.kernel.org/linux-mm/[email protected]/

v6->v7:
  - added filemap_invalidate_{lock,unlock}_shared() around read_cache_folio
    and kept Eduard's Reviewed-by (Eduard);
v5->v6:
  - use local phnum variable in get_build_id_32() (Jann);
  - switch to memcmp() instead of strcmp() in parse_build_id() (Jann);
v4->v5:
  - pass proper file reference to read_cache_folio() (Shakeel);
  - fix another potential overflow due to two u32 additions (Andi);
  - add PageUptodate() check to patch #1 (Jann);
v3->v4:
  - fix a few more potential overflow and out-of-bounds access issues (Andi);
  - use purely folio-based implementation for freader (Matthew);
v2->v3:
  - remove unneeded READ_ONCE()s and force phoff to u64 for 32-bit mode (Andi);
  - moved hardening fixes to the front for easier backporting (Jann);
  - call freader_cleanup() from build_id_parse_buf() for consistency (Jiri);
v1->v2:
  - ensure MADV_PAGEOUT works reliably by paging data in first (Shakeel);
  - to fix the BPF CI build, optionally define MADV_POPULATE_READ in the selftest.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
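To make the sleepable read path concrete, here is a minimal sketch of how a page of an ELF file can be pulled through the page cache with faults allowed, using the filemap_invalidate_{lock,unlock}_shared() pairing around read_cache_folio() mentioned in the v6->v7 note. This is not the actual freader code from lib/buildid.c; elf_read_page() is a hypothetical helper and error handling is trimmed:

/*
 * Hedged sketch only -- not the kernel's freader implementation.
 * elf_read_page() is a hypothetical helper name.
 */
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static const void *elf_read_page(struct file *file, pgoff_t index,
				 struct folio **foliop)
{
	struct address_space *mapping = file->f_mapping;
	struct folio *folio;

	/* guard against concurrent truncation/invalidation (v6->v7 note) */
	filemap_invalidate_lock_shared(mapping);
	folio = read_cache_folio(mapping, index, NULL, file); /* may sleep/fault */
	filemap_invalidate_unlock_shared(mapping);
	if (IS_ERR(folio))
		return NULL;

	*foliop = folio;		/* caller drops with folio_put() */
	return folio_address(folio);	/* assumes a lowmem folio, for brevity */
}

The nofault side cannot block like this; the v4->v5 note about a PageUptodate() check points at that path, which can presumably only use pages that are already resident and uptodate.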
2 parents: 58ff04e + 3c217a1

File tree

11 files changed: +605 −142 lines


include/linux/bpf.h

Lines changed: 2 additions & 0 deletions

@@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
 extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;

include/linux/buildid.h

Lines changed: 2 additions & 2 deletions

@@ -7,8 +7,8 @@
 #define BUILD_ID_SIZE_MAX 20
 
 struct vm_area_struct;
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
-		   __u32 *size);
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
 #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
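With the header split into a faultable and a nofault entry point, call sites read naturally. A minimal sketch, assuming a caller that knows its context statically (log_vma_build_id() is a hypothetical helper, not part of this series):

#include <linux/buildid.h>
#include <linux/mm_types.h>
#include <linux/printk.h>

/* Hedged sketch: pick the entry point that matches the calling context. */
static void log_vma_build_id(struct vm_area_struct *vma, bool sleepable)
{
	unsigned char build_id[BUILD_ID_SIZE_MAX];
	__u32 sz;
	int err;

	err = sleepable ? build_id_parse(vma, build_id, &sz)	/* may fault */
			: build_id_parse_nofault(vma, build_id, &sz);
	if (!err)
		pr_info("build id: %*phN\n", sz, build_id);	/* hex dump */
}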

kernel/bpf/stackmap.c

Lines changed: 101 additions & 30 deletions

@@ -124,8 +124,24 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+	return may_fault ? build_id_parse(vma, build_id, NULL)
+			 : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ *   succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ *   and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ *   id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ *   BPF_STACK_BUILD_ID_IP.
+ */
 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
-					  u64 *ips, u32 trace_nr, bool user)
+					  u32 trace_nr, bool user, bool may_fault)
 {
 	int i;
 	struct mmap_unlock_irq_work *work = NULL;
@@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
-			id_offs[i].ip = ips[i];
 			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
 		}
 		return;
 	}
 
 	for (i = 0; i < trace_nr; i++) {
-		if (range_in_vma(prev_vma, ips[i], ips[i])) {
+		u64 ip = READ_ONCE(id_offs[i].ip);
+
+		if (range_in_vma(prev_vma, ip, ip)) {
 			vma = prev_vma;
-			memcpy(id_offs[i].build_id, prev_build_id,
-			       BUILD_ID_SIZE_MAX);
+			memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
 			goto build_id_valid;
 		}
-		vma = find_vma(current->mm, ips[i]);
-		if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+		vma = find_vma(current->mm, ip);
+		if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
 			/* per entry fall back to ips */
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
-			id_offs[i].ip = ips[i];
 			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
 			continue;
 		}
 build_id_valid:
-		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
-			- vma->vm_start;
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 		prev_vma = vma;
 		prev_build_id = id_offs[i].build_id;
@@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
-	u32 hash, id, trace_nr, trace_len;
+	u32 hash, id, trace_nr, trace_len, i;
 	bool user = flags & BPF_F_USER_STACK;
 	u64 *ips;
 	bool hash_matches;
@@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
 		return id;
 
 	if (stack_map_use_build_id(map)) {
+		struct bpf_stack_build_id *id_offs;
+
 		/* for build_id+offset, pop a bucket before slow cmp */
 		new_bucket = (struct stack_map_bucket *)
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
 		new_bucket->nr = trace_nr;
-		stack_map_get_build_id_offset(
-			(struct bpf_stack_build_id *)new_bucket->data,
-			ips, trace_nr, user);
+		id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+		for (i = 0; i < trace_nr; i++)
+			id_offs[i].ip = ips[i];
+		stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
 
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 			    struct perf_callchain_entry *trace_in,
-			    void *buf, u32 size, u64 flags)
+			    void *buf, u32 size, u64 flags, bool may_fault)
 {
 	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	if (kernel && user_build_id)
 		goto clear;
 
-	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
-					    : sizeof(u64);
+	elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
 	if (unlikely(size % elem_size))
 		goto clear;
 
@@ -427,28 +443,44 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	if (sysctl_perf_event_max_stack < max_depth)
 		max_depth = sysctl_perf_event_max_stack;
 
+	if (may_fault)
+		rcu_read_lock(); /* need RCU for perf's callchain below */
+
 	if (trace_in)
 		trace = trace_in;
 	else if (kernel && task)
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 					   crosstask, false);
-	if (unlikely(!trace))
-		goto err_fault;
 
-	if (trace->nr < skip)
+	if (unlikely(!trace) || trace->nr < skip) {
+		if (may_fault)
+			rcu_read_unlock();
 		goto err_fault;
+	}
 
 	trace_nr = trace->nr - skip;
 	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
 	copy_len = trace_nr * elem_size;
 
 	ips = trace->ip + skip;
-	if (user && user_build_id)
-		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
-	else
+	if (user_build_id) {
+		struct bpf_stack_build_id *id_offs = buf;
+		u32 i;
+
+		for (i = 0; i < trace_nr; i++)
+			id_offs[i].ip = ips[i];
+	} else {
 		memcpy(buf, ips, copy_len);
+	}
+
+	/* trace/ips should not be dereferenced after this point */
+	if (may_fault)
+		rcu_read_unlock();
+
+	if (user_build_id)
+		stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
 
 	if (size > copy_len)
 		memset(buf + copy_len, 0, size - copy_len);
@@ -464,7 +496,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	   u64, flags)
 {
-	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
 }
 
 const struct bpf_func_proto bpf_get_stack_proto = {
@@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
-	   u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+	.func		= bpf_get_stack_sleepable,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+				 u64 flags, bool may_fault)
 {
 	struct pt_regs *regs;
 	long res = -EINVAL;
@@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 
 	regs = task_pt_regs(task);
 	if (regs)
-		res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+		res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
 	put_task_stack(task);
 
 	return res;
 }
 
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+	   u32, size, u64, flags)
+{
+	return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
 const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.func		= bpf_get_task_stack,
 	.gpl_only	= false,
@@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+	   u32, size, u64, flags)
+{
+	return __bpf_get_task_stack(task, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+	.func		= bpf_get_task_stack_sleepable,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 	   void *, buf, u32, size, u64, flags)
 {
@@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 	__u64 nr_kernel;
 
 	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
-		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_USER_BUILD_ID)))
@@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 		__u64 nr = trace->nr;
 
 		trace->nr = nr_kernel;
-		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
 
 		/* restore nr */
 		trace->nr = nr;
@@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
 			goto clear;
 
 		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
-		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
 	}
 	return err;
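On the program side, sleepable BPF programs pick up the may_fault variants transparently (see the kernel/trace/bpf_trace.c hunks below). A hedged sketch of a sleepable uprobe program in the spirit of the selftests mentioned in the cover letter; the section target, function name, and buffer size are illustrative, not taken from this series:

// Hedged sketch, not an actual selftest from this series.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define MAX_FRAMES 16

/* global (.bss) scratch buffer; fine for a single-threaded demo */
struct bpf_stack_build_id frames[MAX_FRAMES];

SEC("uprobe.s//proc/self/exe:target_func")	/* ".s" = sleepable, may fault */
int handle(struct pt_regs *ctx)
{
	long n = bpf_get_stack(ctx, frames, sizeof(frames),
			       BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);

	if (n < 0)
		return 0;
	/* entries with status == BPF_STACK_BUILD_ID_VALID carry a build ID
	 * plus file offset; failed lookups fall back to raw IPs */
	return 0;
}

char _license[] SEC("license") = "GPL";	/* sleepable get_stack proto is gpl_only */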

kernel/events/core.c

Lines changed: 1 addition & 1 deletion

@@ -8851,7 +8851,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
 	if (atomic_read(&nr_build_id_events))
-		build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+		build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
 
 	perf_iterate_sb(perf_event_mmap_output,
 		       mmap_event,

kernel/trace/bpf_trace.c

Lines changed: 3 additions & 2 deletions

@@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_jiffies64:
 		return &bpf_jiffies64_proto;
 	case BPF_FUNC_get_task_stack:
-		return &bpf_get_task_stack_proto;
+		return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+				       : &bpf_get_task_stack_proto;
 	case BPF_FUNC_copy_from_user:
 		return &bpf_copy_from_user_proto;
 	case BPF_FUNC_copy_from_user_task:
@@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_get_stack:
-		return &bpf_get_stack_proto;
+		return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 	case BPF_FUNC_override_return:
 		return &bpf_override_return_proto;
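For completeness, a hedged sketch of how user space can decode the buffer that bpf_get_stack() fills when BPF_F_USER_BUILD_ID is set. The struct layout and status values come from the UAPI <linux/bpf.h> header; print_frames() itself is illustrative, and n is the byte count returned by the helper divided by sizeof(struct bpf_stack_build_id):

/* Hedged sketch: user-space consumer of a BPF_F_USER_BUILD_ID stack buffer. */
#include <linux/bpf.h>
#include <stdio.h>

static void print_frames(const struct bpf_stack_build_id *frames, int n)
{
	for (int i = 0; i < n; i++) {
		if (frames[i].status == BPF_STACK_BUILD_ID_VALID) {
			/* build ID + file offset: symbolizable offline */
			printf("frame %d: +0x%llx in ", i,
			       (unsigned long long)frames[i].offset);
			for (int j = 0; j < BPF_BUILD_ID_SIZE; j++)
				printf("%02x", frames[i].build_id[j]);
			printf("\n");
		} else {
			/* BPF_STACK_BUILD_ID_IP: raw instruction pointer fallback */
			printf("frame %d: ip 0x%llx\n", i,
			       (unsigned long long)frames[i].ip);
		}
	}
}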
