Commit ef54c1a

Author: Peter Zijlstra
perf: Rework perf_event_exit_event()
Make perf_event_exit_event() more robust, such that we can use it from
other contexts. Specifically the up and coming remove_on_exec.

For this to work we need to address a few issues. Remove_on_exec will not
destroy the entire context, so we cannot rely on TASK_TOMBSTONE to disable
event_function_call() and we thus have to use perf_remove_from_context().

When using perf_remove_from_context(), there are two races to consider.
The first is against close(), where we can have concurrent tear-down of
the event. The second is against child_list iteration, which should not
find a half-baked event.

To address this, teach perf_remove_from_context() to special case
!ctx->is_active and about DETACH_CHILD.

[ [email protected]: fix racing parent/child exit in sync_child_event(). ]

Signed-off-by: Marco Elver <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 874fc35 commit ef54c1a

File tree: 2 files changed (+80, -63 lines)

2 files changed

+80
-63
lines changed
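For orientation before the per-file hunks, here is the reworked perf_remove_from_context() as it reads after this commit, condensed from the kernel/events/core.c hunks below. The inline comments are editorial annotations for this page, not part of the kernel source.

static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->mutex);

        /*
         * Editorial note: event_function_call() can no-op once ctx->task is
         * TASK_TOMBSTONE, so an inactive context is handled directly here
         * under ctx->lock rather than via the cross-CPU call.
         */
        raw_spin_lock_irq(&ctx->lock);
        if (!ctx->is_active) {
                __perf_remove_from_context(event, __get_cpu_context(ctx),
                                           ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        /* Active context: remove via the usual cross-CPU call. */
        event_function_call(event, __perf_remove_from_context, (void *)flags);
}

The !ctx->is_active fast path is what lets callers such as perf_event_exit_event() keep working once ctx->task has become TASK_TOMBSTONE, where event_function_call() would otherwise no-op.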

include/linux/perf_event.h

Lines changed: 1 addition & 0 deletions
@@ -607,6 +607,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK_DATA 0x08
 #define PERF_ATTACH_ITRACE 0x10
 #define PERF_ATTACH_SCHED_CB 0x20
+#define PERF_ATTACH_CHILD 0x40
 
 struct perf_cgroup;
 struct perf_buffer;

kernel/events/core.c

Lines changed: 79 additions & 63 deletions
@@ -2205,6 +2205,26 @@ static void perf_group_detach(struct perf_event *event)
         perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+        struct perf_event *parent_event = event->parent;
+
+        if (!(event->attach_state & PERF_ATTACH_CHILD))
+                return;
+
+        event->attach_state &= ~PERF_ATTACH_CHILD;
+
+        if (WARN_ON_ONCE(!parent_event))
+                return;
+
+        lockdep_assert_held(&parent_event->child_mutex);
+
+        sync_child_event(event);
+        list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
         return event->state == PERF_EVENT_STATE_DEAD;
@@ -2312,6 +2332,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP 0x01UL
+#define DETACH_CHILD 0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2335,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event,
         event_sched_out(event, cpuctx, ctx);
         if (flags & DETACH_GROUP)
                 perf_group_detach(event);
+        if (flags & DETACH_CHILD)
+                perf_child_detach(event);
         list_del_event(event, ctx);
 
         if (!ctx->nr_events && ctx->is_active) {
@@ -2363,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 
         lockdep_assert_held(&ctx->mutex);
 
-        event_function_call(event, __perf_remove_from_context, (void *)flags);
-
         /*
-         * The above event_function_call() can NO-OP when it hits
-         * TASK_TOMBSTONE. In that case we must already have been detached
-         * from the context (by perf_event_exit_event()) but the grouping
-         * might still be in-tact.
+         * Because of perf_event_exit_task(), perf_remove_from_context() ought
+         * to work in the face of TASK_TOMBSTONE, unlike every other
+         * event_function_call() user.
          */
-        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-        if ((flags & DETACH_GROUP) &&
-            (event->attach_state & PERF_ATTACH_GROUP)) {
-                /*
-                 * Since in that case we cannot possibly be scheduled, simply
-                 * detach now.
-                 */
-                raw_spin_lock_irq(&ctx->lock);
-                perf_group_detach(event);
+        raw_spin_lock_irq(&ctx->lock);
+        if (!ctx->is_active) {
+                __perf_remove_from_context(event, __get_cpu_context(ctx),
+                                           ctx, (void *)flags);
                 raw_spin_unlock_irq(&ctx->lock);
+                return;
         }
+        raw_spin_unlock_irq(&ctx->lock);
+
+        event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -12377,14 +12396,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-                             struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
 {
         struct perf_event *parent_event = child_event->parent;
         u64 child_val;
 
-        if (child_event->attr.inherit_stat)
-                perf_event_read_event(child_event, child);
+        if (child_event->attr.inherit_stat) {
+                struct task_struct *task = child_event->ctx->task;
+
+                if (task && task != TASK_TOMBSTONE)
+                        perf_event_read_event(child_event, task);
+        }
 
         child_val = perf_event_count(child_event);
 
@@ -12399,60 +12421,53 @@ static void sync_child_event(struct perf_event *child_event,
 }
 
 static void
-perf_event_exit_event(struct perf_event *child_event,
-                      struct perf_event_context *child_ctx,
-                      struct task_struct *child)
+perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-        struct perf_event *parent_event = child_event->parent;
+        struct perf_event *parent_event = event->parent;
+        unsigned long detach_flags = 0;
 
-        /*
-         * Do not destroy the 'original' grouping; because of the context
-         * switch optimization the original events could've ended up in a
-         * random child task.
-         *
-         * If we were to destroy the original group, all group related
-         * operations would cease to function properly after this random
-         * child dies.
-         *
-         * Do destroy all inherited groups, we don't care about those
-         * and being thorough is better.
-         */
-        raw_spin_lock_irq(&child_ctx->lock);
-        WARN_ON_ONCE(child_ctx->is_active);
+        if (parent_event) {
+                /*
+                 * Do not destroy the 'original' grouping; because of the
+                 * context switch optimization the original events could've
+                 * ended up in a random child task.
+                 *
+                 * If we were to destroy the original group, all group related
+                 * operations would cease to function properly after this
+                 * random child dies.
+                 *
+                 * Do destroy all inherited groups, we don't care about those
+                 * and being thorough is better.
+                 */
+                detach_flags = DETACH_GROUP | DETACH_CHILD;
+                mutex_lock(&parent_event->child_mutex);
+        }
 
-        if (parent_event)
-                perf_group_detach(child_event);
-        list_del_event(child_event, child_ctx);
-        perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
-        raw_spin_unlock_irq(&child_ctx->lock);
+        perf_remove_from_context(event, detach_flags);
+
+        raw_spin_lock_irq(&ctx->lock);
+        if (event->state > PERF_EVENT_STATE_EXIT)
+                perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
+        raw_spin_unlock_irq(&ctx->lock);
 
         /*
-         * Parent events are governed by their filedesc, retain them.
+         * Child events can be freed.
          */
-        if (!parent_event) {
-                perf_event_wakeup(child_event);
+        if (parent_event) {
+                mutex_unlock(&parent_event->child_mutex);
+                /*
+                 * Kick perf_poll() for is_event_hup();
+                 */
+                perf_event_wakeup(parent_event);
+                free_event(event);
+                put_event(parent_event);
                 return;
         }
-        /*
-         * Child events can be cleaned up.
-         */
-
-        sync_child_event(child_event, child);
 
         /*
-         * Remove this event from the parent's list
-         */
-        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-        mutex_lock(&parent_event->child_mutex);
-        list_del_init(&child_event->child_list);
-        mutex_unlock(&parent_event->child_mutex);
-
-        /*
-         * Kick perf_poll() for is_event_hup().
+         * Parent events are governed by their filedesc, retain them.
          */
-        perf_event_wakeup(parent_event);
-        free_event(child_event);
-        put_event(parent_event);
+        perf_event_wakeup(event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -12509,7 +12524,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
         perf_event_task(child, child_ctx, 0);
 
         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-                perf_event_exit_event(child_event, child_ctx, child);
+                perf_event_exit_event(child_event, child_ctx);
 
         mutex_unlock(&child_ctx->mutex);
 
@@ -12769,6 +12784,7 @@ inherit_event(struct perf_event *parent_event,
          */
         raw_spin_lock_irqsave(&child_ctx->lock, flags);
         add_event_to_ctx(child_event, child_ctx);
+        child_event->attach_state |= PERF_ATTACH_CHILD;
         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
         /*
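To illustrate why the rework matters, here is a hypothetical sketch (not part of this commit) of how a caller outside full-context teardown, such as the remove_on_exec support the changelog alludes to, could now tear down individual events from a still-live context. The predicate event_wants_removal_on_exec() is a placeholder name invented for this sketch.

/*
 * Hypothetical sketch only: with this rework, perf_event_exit_event()
 * no longer assumes the whole context is being destroyed, so a caller
 * can remove selected events from a live context.
 */
static void remove_marked_events_sketch(struct perf_event_context *ctx)
{
        struct perf_event *event, *next;

        mutex_lock(&ctx->mutex);
        list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
                if (!event_wants_removal_on_exec(event)) /* placeholder, not a real kernel function */
                        continue;
                /*
                 * Works even though ctx->is_active may be true:
                 * perf_remove_from_context() now handles both the active and
                 * the inactive/TASK_TOMBSTONE case, and DETACH_CHILD keeps the
                 * parent's child_list consistent under child_mutex.
                 */
                perf_event_exit_event(event, ctx);
        }
        mutex_unlock(&ctx->mutex);
}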
