Commit ca6c213

Author: Peter Zijlstra

perf: Fix missing SIGTRAPs

Marco reported:

Due to the implementation of how SIGTRAP are delivered if
perf_event_attr::sigtrap is set, we've noticed 3 issues:

  1. Missing SIGTRAP due to a race with event_sched_out() (more
     details below).

  2. Hardware PMU events being disabled due to returning 1 from
     perf_event_overflow(). The only way to re-enable the event is
     for user space to first "properly" disable the event and then
     re-enable it.

  3. The inability to automatically disable an event after a
     specified number of overflows via PERF_EVENT_IOC_REFRESH.

The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed
as follows:

                CPU0                    |       CPU1
        --------------------------------+---------------------------
        __perf_event_overflow()         |
         perf_event_disable_inatomic()  |
          pending_disable = CPU0        | ...
                                        | _perf_event_enable()
                                        |  event_function_call()
                                        |   task_function_call()
                                        |    /* sends IPI to CPU0 */
        <IPI>                           | ...
         __perf_event_enable()          +---------------------------
          ctx_resched()
           task_ctx_sched_out()
            ctx_sched_out()
             group_sched_out()
              event_sched_out()
               pending_disable = -1
        </IPI>
        <IRQ-work>
         perf_pending_event()
          perf_pending_event_disable()
           /* Fails to send SIGTRAP because no pending_disable! */
        </IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but also
all future SIGTRAPs because 'event_limit' is not reset back to 1.

To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction
of a separate 'pending_sigtrap', no longer using 'event_limit' and
'pending_disable' for its delivery.

Additionally; and different to Marco's proposed patch:

 - recognise that pending_disable effectively duplicates oncpu for
   the case where it is set. As such, change the irq_work handler to
   use ->oncpu to target the event and use pending_* as boolean toggles.

 - observe that SIGTRAP targets the ctx->task, so the context switch
   optimization that carries contexts between tasks is invalid. If the
   irq_work were delayed enough to hit after a context switch the
   SIGTRAP would be delivered to the wrong task.

 - observe that if the event gets scheduled out
   (rotation/migration/context-switch/...) the irq-work would be
   insufficient to deliver the SIGTRAP when the event gets scheduled
   back in (the irq-work might still be pending on the old CPU).

   Therefore have event_sched_out() convert the pending sigtrap into a
   task_work which will deliver the signal at return_to_user.

Fixes: 97ba62b ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <[email protected]>
Debugged-by: Dmitry Vyukov <[email protected]>
Reported-by: Marco Elver <[email protected]>
Debugged-by: Marco Elver <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Marco Elver <[email protected]>
Tested-by: Marco Elver <[email protected]>
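
For illustration, a minimal user-space sketch of the perf_event_attr::sigtrap interface whose delivery this patch repairs; the sketch is not part of the patch. It assumes uapi headers recent enough to provide the sigtrap, remove_on_exec and sig_data attributes and the TRAP_PERF si_code; the event type, sample period, sig_data value and busy loop are illustrative choices only.

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef TRAP_PERF
#define TRAP_PERF 6	/* si_code used for perf-generated SIGTRAP (uapi asm-generic/siginfo.h) */
#endif

static volatile sig_atomic_t hits;

static void sigtrap_handler(int sig, siginfo_t *info, void *uctx)
{
	(void)sig;
	(void)uctx;
	/* Perf tags its SIGTRAPs with TRAP_PERF; si_perf_data carries attr.sig_data. */
	if (info->si_code == TRAP_PERF)
		hits++;
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = sigtrap_handler, .sa_flags = SA_SIGINFO };
	struct perf_event_attr attr;
	int fd;

	sigaction(SIGTRAP, &sa, NULL);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 1000000;	/* SIGTRAP roughly every 1M cycles (illustrative) */
	attr.exclude_kernel = 1;
	attr.sigtrap = 1;		/* deliver SIGTRAP to the target task on overflow */
	attr.remove_on_exec = 1;	/* required in combination with sigtrap */
	attr.sig_data = 0xdeadbeef;	/* surfaced to the handler as si_perf_data */

	/* Self-monitoring: pid == 0, any CPU. */
	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	for (volatile unsigned long i = 0; i < (1UL << 26); i++)
		;	/* burn cycles so the counter overflows */

	printf("SIGTRAPs delivered: %d\n", (int)hits);
	close(fd);
	return 0;
}

Before this patch, a run of such a program could lose a SIGTRAP to the race described above and, because 'event_limit' was never re-armed, all subsequent ones too; with the patch, the signal is delivered either from the irq_work in event context or, if the event was scheduled out, via task_work at return to user space.
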
Parent: 9abf231

3 files changed: +129 -43 lines


include/linux/perf_event.h

Lines changed: 15 additions & 4 deletions

@@ -756,11 +756,14 @@ struct perf_event {
 	struct fasync_struct		*fasync;
 
 	/* delayed work for NMIs and such */
-	int				pending_wakeup;
-	int				pending_kill;
-	int				pending_disable;
+	unsigned int			pending_wakeup;
+	unsigned int			pending_kill;
+	unsigned int			pending_disable;
+	unsigned int			pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
-	struct irq_work			pending;
+	struct irq_work			pending_irq;
+	struct callback_head		pending_task;
+	unsigned int			pending_work;
 
 	atomic_t			event_limit;
 
@@ -877,6 +880,14 @@
 #endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
+
+	/*
+	 * Sum (event->pending_sigtrap + event->pending_work)
+	 *
+	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
+	 * that until the signal is delivered.
+	 */
+	local_t				nr_pending;
 };
 
 /*

kernel/events/core.c

Lines changed: 113 additions & 38 deletions

@@ -54,6 +54,7 @@
 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
-	if (READ_ONCE(event->pending_disable) >= 0) {
-		WRITE_ONCE(event->pending_disable, -1);
+	if (event->pending_disable) {
+		event->pending_disable = 0;
 		perf_cgroup_event_disable(event, ctx);
 		state = PERF_EVENT_STATE_OFF;
 	}
+
+	if (event->pending_sigtrap) {
+		bool dec = true;
+
+		event->pending_sigtrap = 0;
+		if (state != PERF_EVENT_STATE_OFF &&
+		    !event->pending_work) {
+			event->pending_work = 1;
+			dec = false;
+			task_work_add(current, &event->pending_task, TWA_RESUME);
+		}
+		if (dec)
+			local_dec(&event->ctx->nr_pending);
+	}
+
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
 	 * hold the top-level event's child_mutex, so any descendant that
 	 * goes to exit will block in perf_event_exit_event().
 	 *
-	 * When called from perf_pending_event it's OK because event->ctx
+	 * When called from perf_pending_irq it's OK because event->ctx
 	 * is the current context on this CPU and preemption is disabled,
 	 * hence we can't get into perf_event_task_sched_out for this context.
 	 */
@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-	WRITE_ONCE(event->pending_disable, smp_processor_id());
-	/* can fail, see perf_pending_event_disable() */
-	irq_work_queue(&event->pending);
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
+			perf_pmu_disable(pmu);
+
+			/* PMIs are disabled; ctx->nr_pending is stable. */
+			if (local_read(&ctx->nr_pending) ||
+			    local_read(&next_ctx->nr_pending)) {
+				/*
+				 * Must not swap out ctx when there's pending
+				 * events that rely on the ctx->task relation.
+				 */
+				raw_spin_unlock(&next_ctx->lock);
+				rcu_read_unlock();
+				goto inside_switch;
+			}
+
 			WRITE_ONCE(ctx->task, next);
 			WRITE_ONCE(next_ctx->task, task);
 
-			perf_pmu_disable(pmu);
-
 			if (cpuctx->sched_cb_usage && pmu->sched_task)
 				pmu->sched_task(ctx, false);
 
@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	raw_spin_lock(&ctx->lock);
 	perf_pmu_disable(pmu);
 
+inside_switch:
 	if (cpuctx->sched_cb_usage && pmu->sched_task)
 		pmu->sched_task(ctx, false);
 	task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 
 static void _free_event(struct perf_event *event)
 {
-	irq_work_sync(&event->pending);
+	irq_work_sync(&event->pending_irq);
 
 	unaccount_event(event);
 
@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
 		return;
 
 	/*
-	 * perf_pending_event() can race with the task exiting.
+	 * Both perf_pending_task() and perf_pending_irq() can race with the
+	 * task exiting.
 	 */
 	if (current->flags & PF_EXITING)
 		return;
@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-	int cpu = READ_ONCE(event->pending_disable);
+	int cpu = READ_ONCE(event->oncpu);
 
+	/*
+	 * If the event isn't running; we done. event_sched_out() will have
+	 * taken care of things.
	 */
 	if (cpu < 0)
 		return;
 
+	/*
+	 * Yay, we hit home and are in the context of the event.
+	 */
 	if (cpu == smp_processor_id()) {
-		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
+		if (event->pending_sigtrap) {
+			event->pending_sigtrap = 0;
 			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
+			local_dec(&event->ctx->nr_pending);
+		}
+		if (event->pending_disable) {
+			event->pending_disable = 0;
+			perf_event_disable_local(event);
 		}
-
-		perf_event_disable_local(event);
 		return;
 	}
 
@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
 	 *				  irq_work_queue(); // FAILS
 	 *
 	 *  irq_work_run()
-	 *    perf_pending_event()
+	 *    perf_pending_irq()
 	 *
 	 * But the event runs on CPU-B and wants disabling there.
 	 */
-	irq_work_queue_on(&event->pending, cpu);
+	irq_work_queue_on(&event->pending_irq, cpu);
 }
 
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-	struct perf_event *event = container_of(entry, struct perf_event, pending);
+	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
 	int rctx;
 
-	rctx = perf_swevent_get_recursion_context();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
 	 */
+	rctx = perf_swevent_get_recursion_context();
 
-	perf_pending_event_disable(event);
-
+	/*
+	 * The wakeup isn't bound to the context of the event -- it can happen
+	 * irrespective of where the event is.
+	 */
 	if (event->pending_wakeup) {
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
 
+	__perf_pending_irq(event);
+
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task(struct callback_head *head)
+{
+	struct perf_event *event = container_of(head, struct perf_event, pending_task);
+	int rctx;
+
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+
+	if (event->pending_work) {
+		event->pending_work = 0;
+		perf_sigtrap(event);
+		local_dec(&event->ctx->nr_pending);
+	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
  */
 
 static int __perf_event_overflow(struct perf_event *event,
-				   int throttle, struct perf_sample_data *data,
-				   struct pt_regs *regs)
+				 int throttle, struct perf_sample_data *data,
+				 struct pt_regs *regs)
 {
 	int events = atomic_read(&event->event_limit);
 	int ret = 0;
@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		if (!event->pending_sigtrap) {
+			event->pending_sigtrap = 1;
+			local_inc(&event->ctx->nr_pending);
+		}
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending_irq);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
-		irq_work_queue(&event->pending);
+		irq_work_queue(&event->pending_irq);
 	}
 
 	return ret;
 }
 
 int perf_event_overflow(struct perf_event *event,
-			  struct perf_sample_data *data,
-			  struct pt_regs *regs)
+			struct perf_sample_data *data,
+			struct pt_regs *regs)
 {
 	return __perf_event_overflow(event, 1, data, regs);
 }
@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
 	init_waitqueue_head(&event->waitq);
-	event->pending_disable = -1;
-	init_irq_work(&event->pending, perf_pending_event);
+	init_irq_work(&event->pending_irq, perf_pending_irq);
+	init_task_work(&event->pending_task, perf_pending_task);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*

kernel/events/ring_buffer.c

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 	atomic_set(&handle->rb->poll, EPOLLIN);
 
 	handle->event->pending_wakeup = 1;
-	irq_work_queue(&handle->event->pending);
+	irq_work_queue(&handle->event->pending_irq);
 }
 
 /*
