
Commit a4faf00

virtuoso authored and Ingo Molnar committed
perf/aux: Allow using AUX data in perf samples
AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
and profiling by providing, for example, a history of instruction flow
leading up to the event's overflow.

The implementation makes use of grouping an AUX event with all the events
that wish to take samples of the AUX data, such that the former is the
group leader. The samplees should also specify the desired size of the AUX
sample via attr.aux_sample_size.

AUX-capable PMUs need to explicitly add support for sampling, because it
relies on a new callback to take a snapshot of the buffer without touching
the event states.

Signed-off-by: Alexander Shishkin <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: David Ahern <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vince Weaver <[email protected]>
Cc: [email protected]
Cc: [email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
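As a rough illustration (not part of this commit), a consumer could request AUX samples roughly like the sketch below: the AUX-producing event is opened first and passed as group_fd for the sampled event, which sets PERF_SAMPLE_AUX and attr.aux_sample_size. It assumes a uapi header that already carries the fields added here; the aux_pmu_type value is a placeholder that would normally be read from sysfs.

/*
 * Minimal userspace sketch, not part of this commit: open an AUX-capable
 * event as group leader, then a sampled event that requests AUX snapshots
 * via PERF_SAMPLE_AUX + attr.aux_sample_size. The aux_pmu_type argument is
 * a placeholder (normally read from /sys/bus/event_source/devices/...).
 * The leader's AUX area still has to be mmap'ed before any data can be
 * snapshotted (not shown).
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int open_aux_sampling(int aux_pmu_type)
{
	struct perf_event_attr aux = { 0 }, ev = { 0 };
	int leader_fd, ev_fd;

	/* Group leader: the AUX-producing event. */
	aux.size = sizeof(aux);
	aux.type = aux_pmu_type;

	leader_fd = perf_event_open(&aux, 0, -1, -1, 0);
	if (leader_fd < 0)
		return -1;

	/* Sampled event: each of its samples carries an AUX snapshot. */
	ev.size = sizeof(ev);
	ev.type = PERF_TYPE_HARDWARE;
	ev.config = PERF_COUNT_HW_CPU_CYCLES;
	ev.sample_period = 100000;
	ev.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_AUX;
	ev.aux_sample_size = 4096;	/* bytes of AUX data per sample */

	ev_fd = perf_event_open(&ev, 0, -1, leader_fd, 0);
	if (ev_fd < 0) {
		close(leader_fd);
		return -1;
	}
	return ev_fd;
}

Per the checks added to perf_get_aux_event() below, the group leader's PMU must implement ->snapshot_aux, and aux_output and aux_sample_size cannot be combined on the same event.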
1 parent deb0c3c commit a4faf00

5 files changed: 234 additions, 5 deletions

include/linux/perf_event.h

Lines changed: 19 additions & 0 deletions
@@ -249,6 +249,8 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_EXCLUDE		0x80
 #define PERF_PMU_CAP_AUX_OUTPUT		0x100
 
+struct perf_output_handle;
+
 /**
  * struct pmu - generic performance monitoring unit
  */
@@ -432,6 +434,19 @@ struct pmu {
 	 */
 	void (*free_aux)		(void *aux); /* optional */
 
+	/*
+	 * Take a snapshot of the AUX buffer without touching the event
+	 * state, so that preempting ->start()/->stop() callbacks does
+	 * not interfere with their logic. Called in PMI context.
+	 *
+	 * Returns the size of AUX data copied to the output handle.
+	 *
+	 * Optional.
+	 */
+	long (*snapshot_aux)		(struct perf_event *event,
+					 struct perf_output_handle *handle,
+					 unsigned long size);
+
 	/*
 	 * Validate address range filters: make sure the HW supports the
 	 * requested configuration and number of filters; return 0 if the
@@ -973,6 +988,7 @@ struct perf_sample_data {
 		u32	reserved;
 	}				cpu_entry;
 	struct perf_callchain_entry	*callchain;
+	u64				aux_size;
 
 	/*
 	 * regs_user may point to task_pt_regs or to regs_user_copy, depending
@@ -1362,6 +1378,9 @@ extern unsigned int perf_output_copy(struct perf_output_handle *handle,
 				     const void *buf, unsigned int len);
 extern unsigned int perf_output_skip(struct perf_output_handle *handle,
 				     unsigned int len);
+extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+				 struct perf_output_handle *handle,
+				 unsigned long from, unsigned long to);
 extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern u64 perf_swevent_set_period(struct perf_event *event);
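For driver authors, the wiring amounts to filling in the new ->snapshot_aux member when registering the PMU. A hedged sketch follows; only the .snapshot_aux member is introduced by this patch, the my_* names are placeholders.

/*
 * Hypothetical driver-side wiring; only the .snapshot_aux member is new in
 * this patch, the rest of struct pmu is the existing interface and the
 * my_* callbacks are placeholders.
 */
static long my_snapshot_aux(struct perf_event *event,
			    struct perf_output_handle *handle,
			    unsigned long size);	/* see the sketch after ring_buffer.c below */

static struct pmu my_aux_pmu = {
	/* ... .add/.del/.start/.stop/.setup_aux/.free_aux as before ... */
	.snapshot_aux	= my_snapshot_aux,	/* new: optional, runs in PMI/NMI context */
};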

include/uapi/linux/perf_event.h

Lines changed: 8 additions & 2 deletions
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_AUX				= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -300,6 +301,7 @@ enum perf_event_read_format {
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -424,7 +426,9 @@ struct perf_event_attr {
 	 */
 	__u32	aux_watermark;
 	__u16	sample_max_stack;
-	__u16	__reserved_2;	/* align to __u64 */
+	__u16	__reserved_2;
+	__u32	aux_sample_size;
+	__u32	__reserved_3;
 };
 
 /*
@@ -864,6 +868,8 @@ enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			size;
+	 *	  char			data[size]; } && PERF_SAMPLE_AUX
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
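On the consumer side, the new sample layout is simply a u64 size followed by that many bytes of AUX data at the tail of PERF_RECORD_SAMPLE. A rough parsing sketch, not part of this commit, assuming the earlier sample_type fields have already been walked; the callback type is a placeholder.

/*
 * Sketch of consuming the PERF_SAMPLE_AUX tail of a PERF_RECORD_SAMPLE;
 * not part of this commit. 'p' is assumed to point just past the
 * PERF_SAMPLE_PHYS_ADDR field; aux_cb_t is a placeholder.
 */
#include <stdint.h>
#include <string.h>

#define PERF_SAMPLE_AUX_BIT	(1ULL << 20)	/* matches the new enum value */

typedef void (*aux_cb_t)(const void *data, uint64_t size);

static const unsigned char *parse_sample_aux(const unsigned char *p,
					     uint64_t sample_type, aux_cb_t cb)
{
	if (sample_type & PERF_SAMPLE_AUX_BIT) {
		uint64_t size;

		memcpy(&size, p, sizeof(size));		/* u64 size */
		p += sizeof(size);

		cb(p, size);				/* char data[size] */

		/* size is already u64-aligned; the kernel zero-pads the data. */
		p += size;
	}
	return p;
}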

kernel/events/core.c

Lines changed: 170 additions & 3 deletions
@@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event)
 	}
 }
 
+static bool perf_need_aux_event(struct perf_event *event)
+{
+	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+}
+
 static int perf_get_aux_event(struct perf_event *event,
 			      struct perf_event *group_leader)
 {
@@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event,
 	if (!group_leader)
 		return 0;
 
-	if (!perf_aux_output_match(event, group_leader))
+	/*
+	 * aux_output and aux_sample_size are mutually exclusive.
+	 */
+	if (event->attr.aux_output && event->attr.aux_sample_size)
+		return 0;
+
+	if (event->attr.aux_output &&
+	    !perf_aux_output_match(event, group_leader))
+		return 0;
+
+	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
 		return 0;
 
 	if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -6222,6 +6237,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 	}
 }
 
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+					     struct perf_sample_data *data,
+					     size_t size)
+{
+	struct perf_event *sampler = event->aux_event;
+	struct ring_buffer *rb;
+
+	data->aux_size = 0;
+
+	if (!sampler)
+		goto out;
+
+	if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+		goto out;
+
+	if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+		goto out;
+
+	rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+	if (!rb)
+		goto out;
+
+	/*
+	 * If this is an NMI hit inside sampling code, don't take
+	 * the sample. See also perf_aux_sample_output().
+	 */
+	if (READ_ONCE(rb->aux_in_sampling)) {
+		data->aux_size = 0;
+	} else {
+		size = min_t(size_t, size, perf_aux_size(rb));
+		data->aux_size = ALIGN(size, sizeof(u64));
+	}
+	ring_buffer_put(rb);
+
+out:
+	return data->aux_size;
+}
+
+long perf_pmu_snapshot_aux(struct ring_buffer *rb,
+			   struct perf_event *event,
+			   struct perf_output_handle *handle,
+			   unsigned long size)
+{
+	unsigned long flags;
+	long ret;
+
+	/*
+	 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+	 * paths. If we start calling them in NMI context, they may race with
+	 * the IRQ ones, that is, for example, re-starting an event that's just
+	 * been stopped, which is why we're using a separate callback that
+	 * doesn't change the event state.
+	 *
+	 * IRQs need to be disabled to prevent IPIs from racing with us.
+	 */
+	local_irq_save(flags);
+	/*
+	 * Guard against NMI hits inside the critical section;
+	 * see also perf_prepare_sample_aux().
+	 */
+	WRITE_ONCE(rb->aux_in_sampling, 1);
+	barrier();
+
+	ret = event->pmu->snapshot_aux(event, handle, size);
+
+	barrier();
+	WRITE_ONCE(rb->aux_in_sampling, 0);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+				   struct perf_output_handle *handle,
+				   struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_event;
+	unsigned long pad;
+	struct ring_buffer *rb;
+	long size;
+
+	if (WARN_ON_ONCE(!sampler || !data->aux_size))
+		return;
+
+	rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+	if (!rb)
+		return;
+
+	size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+	/*
+	 * An error here means that perf_output_copy() failed (returned a
+	 * non-zero surplus that it didn't copy), which in its current
+	 * enlightened implementation is not possible. If that changes, we'd
+	 * like to know.
+	 */
+	if (WARN_ON_ONCE(size < 0))
+		goto out_put;
+
+	/*
+	 * The pad comes from ALIGN()ing data->aux_size up to u64 in
+	 * perf_prepare_sample_aux(), so should not be more than that.
+	 */
+	pad = data->aux_size - size;
+	if (WARN_ON_ONCE(pad >= sizeof(u64)))
+		pad = 8;
+
+	if (pad) {
+		u64 zero = 0;
+		perf_output_copy(handle, &zero, pad);
+	}
+
+out_put:
+	ring_buffer_put(rb);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -6541,6 +6672,13 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_AUX) {
+		perf_output_put(handle, data->aux_size);
+
+		if (data->aux_size)
+			perf_aux_sample_output(event, handle, data);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6729,6 +6867,35 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_AUX) {
+		u64 size;
+
+		header->size += sizeof(u64); /* size */
+
+		/*
+		 * Given the 16bit nature of header::size, an AUX sample can
+		 * easily overflow it, what with all the preceding sample bits.
+		 * Make sure this doesn't happen by using up to U16_MAX bytes
+		 * per sample in total (rounded down to 8 byte boundary).
+		 */
+		size = min_t(size_t, U16_MAX - header->size,
+			     event->attr.aux_sample_size);
+		size = rounddown(size, 8);
+		size = perf_prepare_sample_aux(event, data, size);
+
+		WARN_ON_ONCE(size + header->size > U16_MAX);
+		header->size += size;
+	}
+	/*
+	 * If you're adding more sample types here, you likely need to do
+	 * something about the overflowing header::size, like repurpose the
+	 * lowest 3 bits of size, which should be always zero at the moment.
+	 * This raises a more important question, do we really need 512k sized
+	 * samples and why, so good argumentation is in order for whatever you
+	 * do here next.
+	 */
+	WARN_ON_ONCE(header->size & 7);
 }
 
 static __always_inline int
@@ -10727,7 +10894,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 
 	attr->size = size;
 
-	if (attr->__reserved_1 || attr->__reserved_2)
+	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
 		return -EINVAL;
 
 	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -11277,7 +11444,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+	if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader))
 		goto err_locked;
 
 	/*

kernel/events/internal.h

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ struct ring_buffer {
 	unsigned long			aux_mmap_locked;
 	void				(*free_aux)(void *);
 	refcount_t			aux_refcount;
+	int				aux_in_sampling;
 	void				**aux_pages;
 	void				*aux_priv;
 
kernel/events/ring_buffer.c

Lines changed: 36 additions & 0 deletions
@@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
 }
 EXPORT_SYMBOL_GPL(perf_get_aux);
 
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+			  struct perf_output_handle *handle,
+			  unsigned long from, unsigned long to)
+{
+	unsigned long tocopy, remainder, len = 0;
+	struct ring_buffer *rb = aux_handle->rb;
+	void *addr;
+
+	from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+	to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+	do {
+		tocopy = PAGE_SIZE - offset_in_page(from);
+		if (to > from)
+			tocopy = min(tocopy, to - from);
+		if (!tocopy)
+			break;
+
+		addr = rb->aux_pages[from >> PAGE_SHIFT];
+		addr += offset_in_page(from);
+
+		remainder = perf_output_copy(handle, addr, tocopy);
+		if (remainder)
+			return -EFAULT;
+
+		len += tocopy;
+		from += tocopy;
+		from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+	} while (to != from);
+
+	return len;
+}
+
 #define PERF_AUX_GFP  (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
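To give a feel for how a PMU driver might implement the new callback on top of this helper, here is a hedged sketch. Only perf_get_aux() and perf_output_copy_aux() are real interfaces from this patch; the per-CPU handle, the buffer type and the my_* helpers stand in for driver-specific bookkeeping and hardware read-out.

/*
 * Hypothetical ->snapshot_aux implementation in an AUX-capable driver.
 * Runs in PMI/NMI context and must not change the event state; it only
 * reads out the hardware write pointer and copies the most recent 'size'
 * bytes into the sample via perf_output_copy_aux().
 */
#include <linux/perf_event.h>
#include <linux/percpu.h>

struct my_buffer;			/* driver's AUX buffer bookkeeping (placeholder) */
static DEFINE_PER_CPU(struct perf_output_handle, my_pmu_handle);

/* Placeholders for driver-specific hardware bookkeeping (not real APIs): */
void my_read_hw_offsets(struct my_buffer *buf);
unsigned long my_data_size(struct my_buffer *buf);
unsigned long my_buffer_size(struct my_buffer *buf);

static long my_snapshot_aux(struct perf_event *event,
			    struct perf_output_handle *handle,
			    unsigned long size)
{
	struct perf_output_handle *aux_handle = this_cpu_ptr(&my_pmu_handle);
	struct my_buffer *buf = perf_get_aux(aux_handle);
	unsigned long from, to;

	if (!buf)
		return 0;

	/* Driver-specific: flush/read the hardware's current write offset. */
	my_read_hw_offsets(buf);

	to = my_data_size(buf);		/* current write position, mod buffer size */
	from = to;
	if (to < size)			/* wrap around the circular AUX area */
		from += my_buffer_size(buf);
	from -= size;

	/* perf_output_copy_aux() masks the offsets and handles the wrap. */
	return perf_output_copy_aux(aux_handle, handle, from, to);
}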
