Skip to content

Commit 34f4392

Browse files
Peter Zijlstra (author), Ingo Molnar (committer)
authored and committed
perf: Add per event clockid support
While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, its all relative information, as long as its internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.. Where MONOTONIC is good for correlating between machines (static offset), MONOTNIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Cc: Andrew Morton <[email protected]> Cc: Arnaldo Carvalho de Melo <[email protected]> Cc: David Ahern <[email protected]> Cc: Jiri Olsa <[email protected]> Cc: John Stultz <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Stephane Eranian <[email protected]> Cc: Thomas Gleixner <[email protected]> Signed-off-by: Ingo Molnar <[email protected]>
1 parent b381e63 commit 34f4392

File tree

4 files changed

+91
-8
lines changed

4 files changed

+91
-8
lines changed

arch/x86/kernel/cpu/perf_event.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,
19781978

19791979
data = cyc2ns_read_begin();
19801980

1981+
/*
1982+
* Internal timekeeping for enabled/running/stopped times
1983+
* is always in the local_clock domain.
1984+
*/
19811985
userpg->cap_user_time = 1;
19821986
userpg->time_mult = data->cyc2ns_mul;
19831987
userpg->time_shift = data->cyc2ns_shift;
19841988
userpg->time_offset = data->cyc2ns_offset - now;
19851989

1986-
userpg->cap_user_time_zero = 1;
1987-
userpg->time_zero = data->cyc2ns_offset;
1990+
/*
1991+
* cap_user_time_zero doesn't make sense when we're using a different
1992+
* time base for the records.
1993+
*/
1994+
if (event->clock == &local_clock) {
1995+
userpg->cap_user_time_zero = 1;
1996+
userpg->time_zero = data->cyc2ns_offset;
1997+
}
19881998

19891999
cyc2ns_read_end(data);
19902000
}

include/linux/perf_event.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ struct perf_event;
173173
* pmu::capabilities flags
174174
*/
175175
#define PERF_PMU_CAP_NO_INTERRUPT 0x01
176+
#define PERF_PMU_CAP_NO_NMI 0x02
176177

177178
/**
178179
* struct pmu - generic performance monitoring unit
@@ -457,6 +458,7 @@ struct perf_event {
457458
struct pid_namespace *ns;
458459
u64 id;
459460

461+
u64 (*clock)(void);
460462
perf_overflow_handler_t overflow_handler;
461463
void *overflow_handler_context;
462464

include/uapi/linux/perf_event.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,8 @@ struct perf_event_attr {
326326
exclude_callchain_user : 1, /* exclude user callchains */
327327
mmap2 : 1, /* include mmap with inode data */
328328
comm_exec : 1, /* flag comm events that are due to an exec */
329-
__reserved_1 : 39;
329+
use_clockid : 1, /* use @clockid for time fields */
330+
__reserved_1 : 38;
330331

331332
union {
332333
__u32 wakeup_events; /* wakeup every n events */
@@ -355,8 +356,7 @@ struct perf_event_attr {
355356
*/
356357
__u32 sample_stack_user;
357358

358-
/* Align to u64. */
359-
__u32 __reserved_2;
359+
__s32 clockid;
360360
/*
361361
* Defines set of regs to dump for each sample
362362
* state captured on:

kernel/events/core.c

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
327327
return local_clock();
328328
}
329329

330+
static inline u64 perf_event_clock(struct perf_event *event)
331+
{
332+
return event->clock();
333+
}
334+
330335
static inline struct perf_cpu_context *
331336
__get_cpu_context(struct perf_event_context *ctx)
332337
{
@@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
47624767
}
47634768

47644769
if (sample_type & PERF_SAMPLE_TIME)
4765-
data->time = perf_clock();
4770+
data->time = perf_event_clock(event);
47664771

47674772
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
47684773
data->id = primary_event_id(event);
@@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
53405345
task_event->event_id.tid = perf_event_tid(event, task);
53415346
task_event->event_id.ptid = perf_event_tid(event, current);
53425347

5348+
task_event->event_id.time = perf_event_clock(event);
5349+
53435350
perf_output_put(&handle, task_event->event_id);
53445351

53455352
perf_event__output_id_sample(event, &handle, &sample);
@@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
53735380
/* .ppid */
53745381
/* .tid */
53755382
/* .ptid */
5376-
.time = perf_clock(),
5383+
/* .time */
53775384
},
53785385
};
53795386

@@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
57495756
.misc = 0,
57505757
.size = sizeof(throttle_event),
57515758
},
5752-
.time = perf_clock(),
5759+
.time = perf_event_clock(event),
57535760
.id = primary_event_id(event),
57545761
.stream_id = event->id,
57555762
};
@@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
62936300
static struct pmu perf_swevent = {
62946301
.task_ctx_nr = perf_sw_context,
62956302

6303+
.capabilities = PERF_PMU_CAP_NO_NMI,
6304+
62966305
.event_init = perf_swevent_init,
62976306
.add = perf_swevent_add,
62986307
.del = perf_swevent_del,
@@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
66366645
static struct pmu perf_cpu_clock = {
66376646
.task_ctx_nr = perf_sw_context,
66386647

6648+
.capabilities = PERF_PMU_CAP_NO_NMI,
6649+
66396650
.event_init = cpu_clock_event_init,
66406651
.add = cpu_clock_event_add,
66416652
.del = cpu_clock_event_del,
@@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
67156726
static struct pmu perf_task_clock = {
67166727
.task_ctx_nr = perf_sw_context,
67176728

6729+
.capabilities = PERF_PMU_CAP_NO_NMI,
6730+
67186731
.event_init = task_clock_event_init,
67196732
.add = task_clock_event_add,
67206733
.del = task_clock_event_del,
@@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
72007213
event->hw.target = task;
72017214
}
72027215

7216+
event->clock = &local_clock;
7217+
if (parent_event)
7218+
event->clock = parent_event->clock;
7219+
72037220
if (!overflow_handler && parent_event) {
72047221
overflow_handler = parent_event->overflow_handler;
72057222
context = parent_event->overflow_handler_context;
@@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
74227439
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
74237440
goto out;
74247441

7442+
/*
7443+
* Mixing clocks in the same buffer is trouble you don't need.
7444+
*/
7445+
if (output_event->clock != event->clock)
7446+
goto out;
7447+
74257448
set:
74267449
mutex_lock(&event->mmap_mutex);
74277450
/* Can't redirect output if we've got an active mmap() */
@@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
74547477
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
74557478
}
74567479

7480+
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7481+
{
7482+
bool nmi_safe = false;
7483+
7484+
switch (clk_id) {
7485+
case CLOCK_MONOTONIC:
7486+
event->clock = &ktime_get_mono_fast_ns;
7487+
nmi_safe = true;
7488+
break;
7489+
7490+
case CLOCK_MONOTONIC_RAW:
7491+
event->clock = &ktime_get_raw_fast_ns;
7492+
nmi_safe = true;
7493+
break;
7494+
7495+
case CLOCK_REALTIME:
7496+
event->clock = &ktime_get_real_ns;
7497+
break;
7498+
7499+
case CLOCK_BOOTTIME:
7500+
event->clock = &ktime_get_boot_ns;
7501+
break;
7502+
7503+
case CLOCK_TAI:
7504+
event->clock = &ktime_get_tai_ns;
7505+
break;
7506+
7507+
default:
7508+
return -EINVAL;
7509+
}
7510+
7511+
if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7512+
return -EINVAL;
7513+
7514+
return 0;
7515+
}
7516+
74577517
/**
74587518
* sys_perf_event_open - open a performance event, associate it to a task/cpu
74597519
*
@@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
75697629
*/
75707630
pmu = event->pmu;
75717631

7632+
if (attr.use_clockid) {
7633+
err = perf_event_set_clock(event, attr.clockid);
7634+
if (err)
7635+
goto err_alloc;
7636+
}
7637+
75727638
if (group_leader &&
75737639
(is_software_event(event) != is_software_event(group_leader))) {
75747640
if (is_software_event(event)) {
@@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
76187684
*/
76197685
if (group_leader->group_leader != group_leader)
76207686
goto err_context;
7687+
7688+
/* All events in a group should have the same clock */
7689+
if (group_leader->clock != event->clock)
7690+
goto err_context;
7691+
76217692
/*
76227693
* Do not allow to attach to a group in a different
76237694
* task or CPU context:

0 commit comments

Comments
 (0)