Skip to content

Commit 86c2786

Browse files
ahunter6
authored and acmel committed
perf intel-pt: Add support for PERF_RECORD_SWITCH
Add support for selecting and processing PERF_RECORD_SWITCH events for use by Intel PT. If they are available, they will be used in preference to sched_switch events. This enables an unprivileged user to trace multi-threaded or multi-process workloads with any level of perf_event_paranoid. However it depends on kernel support for PERF_RECORD_SWITCH.

Without this patch, tracing a multi-threaded workload will decode without error but all the data will be attributed to the main thread. Without this patch, tracing a multi-process workload will result in decoder errors because the decoder will not know which executable is executing.

Signed-off-by: Adrian Hunter <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent 1b29ac5 commit 86c2786

File tree

2 files changed

+151
-33
lines changed

2 files changed

+151
-33
lines changed

tools/perf/arch/x86/util/intel-pt.c

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -624,13 +624,49 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
624624
* threads.
625625
*/
626626
if (have_timing_info && !cpu_map__empty(cpus)) {
627-
err = intel_pt_track_switches(evlist);
628-
if (err == -EPERM)
629-
pr_debug2("Unable to select sched:sched_switch\n");
630-
else if (err)
631-
return err;
632-
else
633-
ptr->have_sched_switch = 1;
627+
if (perf_can_record_switch_events()) {
628+
bool cpu_wide = !target__none(&opts->target) &&
629+
!target__has_task(&opts->target);
630+
631+
if (!cpu_wide && perf_can_record_cpu_wide()) {
632+
struct perf_evsel *switch_evsel;
633+
634+
err = parse_events(evlist, "dummy:u", NULL);
635+
if (err)
636+
return err;
637+
638+
switch_evsel = perf_evlist__last(evlist);
639+
640+
switch_evsel->attr.freq = 0;
641+
switch_evsel->attr.sample_period = 1;
642+
switch_evsel->attr.context_switch = 1;
643+
644+
switch_evsel->system_wide = true;
645+
switch_evsel->no_aux_samples = true;
646+
switch_evsel->immediate = true;
647+
648+
perf_evsel__set_sample_bit(switch_evsel, TID);
649+
perf_evsel__set_sample_bit(switch_evsel, TIME);
650+
perf_evsel__set_sample_bit(switch_evsel, CPU);
651+
652+
opts->record_switch_events = false;
653+
ptr->have_sched_switch = 3;
654+
} else {
655+
opts->record_switch_events = true;
656+
if (cpu_wide)
657+
ptr->have_sched_switch = 3;
658+
else
659+
ptr->have_sched_switch = 2;
660+
}
661+
} else {
662+
err = intel_pt_track_switches(evlist);
663+
if (err == -EPERM)
664+
pr_debug2("Unable to select sched:sched_switch\n");
665+
else if (err)
666+
return err;
667+
else
668+
ptr->have_sched_switch = 1;
669+
}
634670
}
635671

636672
if (intel_pt_evsel) {
@@ -663,8 +699,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
663699
tracking_evsel->attr.sample_period = 1;
664700

665701
/* In per-cpu case, always need the time of mmap events etc */
666-
if (!cpu_map__empty(cpus))
702+
if (!cpu_map__empty(cpus)) {
667703
perf_evsel__set_sample_bit(tracking_evsel, TIME);
704+
/* And the CPU for switch events */
705+
perf_evsel__set_sample_bit(tracking_evsel, CPU);
706+
}
668707
}
669708

670709
/*

tools/perf/util/intel-pt.c

Lines changed: 104 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,11 +1145,13 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
11451145
return 0;
11461146
}
11471147

1148-
static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
1148+
static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
11491149
{
1150+
struct machine *machine = pt->machine;
11501151
struct map *map;
11511152
struct symbol *sym, *start;
11521153
u64 ip, switch_ip = 0;
1154+
const char *ptss;
11531155

11541156
if (ptss_ip)
11551157
*ptss_ip = 0;
@@ -1177,8 +1179,13 @@ static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
11771179
if (!switch_ip || !ptss_ip)
11781180
return 0;
11791181

1182+
if (pt->have_sched_switch == 1)
1183+
ptss = "perf_trace_sched_switch";
1184+
else
1185+
ptss = "__perf_event_task_sched_out";
1186+
11801187
for (sym = start; sym; sym = dso__next_symbol(sym)) {
1181-
if (!strcmp(sym->name, "perf_trace_sched_switch")) {
1188+
if (!strcmp(sym->name, ptss)) {
11821189
ip = map->unmap_ip(map, sym->start);
11831190
if (ip >= map->start && ip < map->end) {
11841191
*ptss_ip = ip;
@@ -1198,11 +1205,11 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
11981205

11991206
if (!pt->kernel_start) {
12001207
pt->kernel_start = machine__kernel_start(pt->machine);
1201-
if (pt->per_cpu_mmaps && pt->have_sched_switch &&
1208+
if (pt->per_cpu_mmaps &&
1209+
(pt->have_sched_switch == 1 || pt->have_sched_switch == 3) &&
12021210
!pt->timeless_decoding && intel_pt_tracing_kernel(pt) &&
12031211
!pt->sampling_mode) {
1204-
pt->switch_ip = intel_pt_switch_ip(pt->machine,
1205-
&pt->ptss_ip);
1212+
pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip);
12061213
if (pt->switch_ip) {
12071214
intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
12081215
pt->switch_ip, pt->ptss_ip);
@@ -1387,31 +1394,18 @@ static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
13871394
return NULL;
13881395
}
13891396

1390-
static int intel_pt_process_switch(struct intel_pt *pt,
1391-
struct perf_sample *sample)
1397+
static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
1398+
u64 timestamp)
13921399
{
13931400
struct intel_pt_queue *ptq;
1394-
struct perf_evsel *evsel;
1395-
pid_t tid;
1396-
int cpu, err;
1397-
1398-
evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
1399-
if (evsel != pt->switch_evsel)
1400-
return 0;
1401-
1402-
tid = perf_evsel__intval(evsel, sample, "next_pid");
1403-
cpu = sample->cpu;
1404-
1405-
intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
1406-
cpu, tid, sample->time, perf_time_to_tsc(sample->time,
1407-
&pt->tc));
1401+
int err;
14081402

14091403
if (!pt->sync_switch)
1410-
goto out;
1404+
return 1;
14111405

14121406
ptq = intel_pt_cpu_to_ptq(pt, cpu);
14131407
if (!ptq)
1414-
goto out;
1408+
return 1;
14151409

14161410
switch (ptq->switch_state) {
14171411
case INTEL_PT_SS_NOT_TRACING:
@@ -1424,7 +1418,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
14241418
return 0;
14251419
case INTEL_PT_SS_EXPECTING_SWITCH_EVENT:
14261420
if (!ptq->on_heap) {
1427-
ptq->timestamp = perf_time_to_tsc(sample->time,
1421+
ptq->timestamp = perf_time_to_tsc(timestamp,
14281422
&pt->tc);
14291423
err = auxtrace_heap__add(&pt->heap, ptq->queue_nr,
14301424
ptq->timestamp);
@@ -1441,10 +1435,76 @@ static int intel_pt_process_switch(struct intel_pt *pt,
14411435
default:
14421436
break;
14431437
}
1444-
out:
1438+
1439+
return 1;
1440+
}
1441+
1442+
static int intel_pt_process_switch(struct intel_pt *pt,
1443+
struct perf_sample *sample)
1444+
{
1445+
struct perf_evsel *evsel;
1446+
pid_t tid;
1447+
int cpu, ret;
1448+
1449+
evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
1450+
if (evsel != pt->switch_evsel)
1451+
return 0;
1452+
1453+
tid = perf_evsel__intval(evsel, sample, "next_pid");
1454+
cpu = sample->cpu;
1455+
1456+
intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
1457+
cpu, tid, sample->time, perf_time_to_tsc(sample->time,
1458+
&pt->tc));
1459+
1460+
ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
1461+
if (ret <= 0)
1462+
return ret;
1463+
14451464
return machine__set_current_tid(pt->machine, cpu, -1, tid);
14461465
}
14471466

1467+
static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
1468+
struct perf_sample *sample)
1469+
{
1470+
bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
1471+
pid_t pid, tid;
1472+
int cpu, ret;
1473+
1474+
cpu = sample->cpu;
1475+
1476+
if (pt->have_sched_switch == 3) {
1477+
if (!out)
1478+
return 0;
1479+
if (event->header.type != PERF_RECORD_SWITCH_CPU_WIDE) {
1480+
pr_err("Expecting CPU-wide context switch event\n");
1481+
return -EINVAL;
1482+
}
1483+
pid = event->context_switch.next_prev_pid;
1484+
tid = event->context_switch.next_prev_tid;
1485+
} else {
1486+
if (out)
1487+
return 0;
1488+
pid = sample->pid;
1489+
tid = sample->tid;
1490+
}
1491+
1492+
if (tid == -1) {
1493+
pr_err("context_switch event has no tid\n");
1494+
return -EINVAL;
1495+
}
1496+
1497+
intel_pt_log("context_switch: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
1498+
cpu, pid, tid, sample->time, perf_time_to_tsc(sample->time,
1499+
&pt->tc));
1500+
1501+
ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
1502+
if (ret <= 0)
1503+
return ret;
1504+
1505+
return machine__set_current_tid(pt->machine, cpu, pid, tid);
1506+
}
1507+
14481508
static int intel_pt_process_itrace_start(struct intel_pt *pt,
14491509
union perf_event *event,
14501510
struct perf_sample *sample)
@@ -1515,6 +1575,9 @@ static int intel_pt_process_event(struct perf_session *session,
15151575
err = intel_pt_process_switch(pt, sample);
15161576
else if (event->header.type == PERF_RECORD_ITRACE_START)
15171577
err = intel_pt_process_itrace_start(pt, event, sample);
1578+
else if (event->header.type == PERF_RECORD_SWITCH ||
1579+
event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
1580+
err = intel_pt_context_switch(pt, event, sample);
15181581

15191582
intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n",
15201583
perf_event__name(event->header.type), event->header.type,
@@ -1777,6 +1840,18 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist)
17771840
return NULL;
17781841
}
17791842

1843+
static bool intel_pt_find_switch(struct perf_evlist *evlist)
1844+
{
1845+
struct perf_evsel *evsel;
1846+
1847+
evlist__for_each(evlist, evsel) {
1848+
if (evsel->attr.context_switch)
1849+
return true;
1850+
}
1851+
1852+
return false;
1853+
}
1854+
17801855
static const char * const intel_pt_info_fmts[] = {
17811856
[INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n",
17821857
[INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n",
@@ -1888,6 +1963,10 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
18881963
pr_err("%s: missing sched_switch event\n", __func__);
18891964
goto err_delete_thread;
18901965
}
1966+
} else if (pt->have_sched_switch == 2 &&
1967+
!intel_pt_find_switch(session->evlist)) {
1968+
pr_err("%s: missing context_switch attribute flag\n", __func__);
1969+
goto err_delete_thread;
18911970
}
18921971

18931972
if (session->itrace_synth_opts && session->itrace_synth_opts->set) {

0 commit comments

Comments
 (0)