Skip to content

Commit 7e3f977

Browse files
borkmann authored and davem330 committed
perf, events: add non-linear data support for raw records
This patch adds support for non-linear data on raw records. It extends raw records to have one or multiple fragments that will be written linearly into the ring slot, where each fragment can optionally have a custom callback handler to walk and extract complex, possibly non-linear data.

If a callback handler is provided for a fragment, then the new __output_custom() will be used instead of __output_copy() for the perf_output_sample() part. perf_prepare_sample() does all the size calculation only once, so perf_output_sample() doesn't need to redo the same work anymore, meaning real_size and padding will be cached in the raw record. The raw record becomes 32 bytes in size without holes; to not increase it further and to avoid doing unnecessary recalculations in fast-path, we can reuse next pointer of the last fragment, idea here is borrowed from ZERO_OR_NULL_PTR(), which should keep the perf_output_sample() path for PERF_SAMPLE_RAW minimal.

This facility is needed for BPF's event output helper as a first user that will, in a follow-up, add an additional perf_raw_frag to its perf_raw_record in order to be able to more efficiently dump skb context after a linear head meta data related to it. skbs can be non-linear and thus need a custom output function to dump buffers. Currently, the skb data needs to be copied twice; with the help of __output_custom() this work only needs to be done once. Future users could be things like XDP/BPF programs that work on different context though and would thus also have a different callback function.

The few users of raw records are adapted to initialize their frag data from the raw record itself, no change in behavior for them. The code is based upon a PoC diff provided by Peter Zijlstra [1].

[1] http://thread.gmane.org/gmane.linux.network/421294

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 7acef60 commit 7e3f977

File tree

6 files changed

+93
-32
lines changed

6 files changed

+93
-32
lines changed

arch/s390/kernel/perf_cpum_sf.c

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -979,12 +979,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
979979
struct pt_regs regs;
980980
struct perf_sf_sde_regs *sde_regs;
981981
struct perf_sample_data data;
982-
struct perf_raw_record raw;
982+
struct perf_raw_record raw = {
983+
.frag = {
984+
.size = sfr->size,
985+
.data = sfr,
986+
},
987+
};
983988

984989
/* Setup perf sample */
985990
perf_sample_data_init(&data, 0, event->hw.last_period);
986-
raw.size = sfr->size;
987-
raw.data = sfr;
988991
data.raw = &raw;
989992

990993
/* Setup pt_regs to look like an CPU-measurement external interrupt

arch/x86/events/amd/ibs.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -655,8 +655,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
655655
}
656656

657657
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
658-
raw.size = sizeof(u32) + ibs_data.size;
659-
raw.data = ibs_data.data;
658+
raw = (struct perf_raw_record){
659+
.frag = {
660+
.size = sizeof(u32) + ibs_data.size,
661+
.data = ibs_data.data,
662+
},
663+
};
660664
data.raw = &raw;
661665
}
662666

include/linux/perf_event.h

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,9 +69,22 @@ struct perf_callchain_entry_ctx {
6969
bool contexts_maxed;
7070
};
7171

72+
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
73+
unsigned long len);
74+
75+
struct perf_raw_frag {
76+
union {
77+
struct perf_raw_frag *next;
78+
unsigned long pad;
79+
};
80+
perf_copy_f copy;
81+
void *data;
82+
u32 size;
83+
} __packed;
84+
7285
struct perf_raw_record {
86+
struct perf_raw_frag frag;
7387
u32 size;
74-
void *data;
7588
};
7689

7790
/*
@@ -1283,6 +1296,11 @@ extern void perf_restore_debug_store(void);
12831296
static inline void perf_restore_debug_store(void) { }
12841297
#endif
12851298

1299+
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
1300+
{
1301+
return frag->pad < sizeof(u64);
1302+
}
1303+
12861304
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
12871305

12881306
/*

kernel/events/core.c

Lines changed: 46 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle,
55535553
}
55545554

55555555
if (sample_type & PERF_SAMPLE_RAW) {
5556-
if (data->raw) {
5557-
u32 raw_size = data->raw->size;
5558-
u32 real_size = round_up(raw_size + sizeof(u32),
5559-
sizeof(u64)) - sizeof(u32);
5560-
u64 zero = 0;
5561-
5562-
perf_output_put(handle, real_size);
5563-
__output_copy(handle, data->raw->data, raw_size);
5564-
if (real_size - raw_size)
5565-
__output_copy(handle, &zero, real_size - raw_size);
5556+
struct perf_raw_record *raw = data->raw;
5557+
5558+
if (raw) {
5559+
struct perf_raw_frag *frag = &raw->frag;
5560+
5561+
perf_output_put(handle, raw->size);
5562+
do {
5563+
if (frag->copy) {
5564+
__output_custom(handle, frag->copy,
5565+
frag->data, frag->size);
5566+
} else {
5567+
__output_copy(handle, frag->data,
5568+
frag->size);
5569+
}
5570+
if (perf_raw_frag_last(frag))
5571+
break;
5572+
frag = frag->next;
5573+
} while (1);
5574+
if (frag->pad)
5575+
__output_skip(handle, NULL, frag->pad);
55665576
} else {
55675577
struct {
55685578
u32 size;
@@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header,
56875697
}
56885698

56895699
if (sample_type & PERF_SAMPLE_RAW) {
5690-
int size = sizeof(u32);
5691-
5692-
if (data->raw)
5693-
size += data->raw->size;
5694-
else
5695-
size += sizeof(u32);
5700+
struct perf_raw_record *raw = data->raw;
5701+
int size;
5702+
5703+
if (raw) {
5704+
struct perf_raw_frag *frag = &raw->frag;
5705+
u32 sum = 0;
5706+
5707+
do {
5708+
sum += frag->size;
5709+
if (perf_raw_frag_last(frag))
5710+
break;
5711+
frag = frag->next;
5712+
} while (1);
5713+
5714+
size = round_up(sum + sizeof(u32), sizeof(u64));
5715+
raw->size = size - sizeof(u32);
5716+
frag->pad = raw->size - sum;
5717+
} else {
5718+
size = sizeof(u64);
5719+
}
56965720

5697-
header->size += round_up(size, sizeof(u64));
5721+
header->size += size;
56985722
}
56995723

57005724
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = {
73317355
static int perf_tp_filter_match(struct perf_event *event,
73327356
struct perf_sample_data *data)
73337357
{
7334-
void *record = data->raw->data;
7358+
void *record = data->raw->frag.data;
73357359

73367360
/* only top level events have filters set */
73377361
if (event->parent)
@@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
73877411
struct perf_event *event;
73887412

73897413
struct perf_raw_record raw = {
7390-
.size = entry_size,
7391-
.data = record,
7414+
.frag = {
7415+
.size = entry_size,
7416+
.data = record,
7417+
},
73927418
};
73937419

73947420
perf_sample_data_init(&data, 0, 0);

kernel/events/internal.h

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
123123
return rb->aux_nr_pages << PAGE_SHIFT;
124124
}
125125

126-
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
127-
static inline unsigned long \
128-
func_name(struct perf_output_handle *handle, \
129-
const void *buf, unsigned long len) \
126+
#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \
130127
{ \
131128
unsigned long size, written; \
132129
\
@@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle, \
152149
return len; \
153150
}
154151

152+
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
153+
static inline unsigned long \
154+
func_name(struct perf_output_handle *handle, \
155+
const void *buf, unsigned long len) \
156+
__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
157+
158+
static inline unsigned long
159+
__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
160+
const void *buf, unsigned long len)
161+
__DEFINE_OUTPUT_COPY_BODY(copy_func)
162+
155163
static inline unsigned long
156164
memcpy_common(void *dst, const void *src, unsigned long n)
157165
{

kernel/trace/bpf_trace.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
245245
struct bpf_event_entry *ee;
246246
struct perf_event *event;
247247
struct perf_raw_record raw = {
248-
.size = size,
249-
.data = data,
248+
.frag = {
249+
.size = size,
250+
.data = data,
251+
},
250252
};
251253

252254
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))

0 commit comments

Comments
 (0)