
Commit ff16562

Kan Liang authored and acmel committed
perf callchain: Stitch LBR call stack
In LBR call stack mode, the depth of the reconstructed LBR call stack is limited by the number of LBR registers. For example, on Skylake the reconstructed LBR call stack can never be deeper than 32 entries:

  # To display the perf.data header info, please use
  # --header/--header-only options.
  #
  #
  # Total Lost Samples: 0
  #
  # Samples: 6K of event 'cycles'
  # Event count (approx.): 6487119731
  #
  # Children      Self  Command          Shared Object       Symbol
  # ........  ........  ...............  ..................  ................................
  #
     99.97%    99.97%  tchain_edit      tchain_edit         [.] f43
             |
              --99.64%--f11
                        f12
                        f13
                        f14
                        f15
                        f16
                        f17
                        f18
                        f19
                        f20
                        f21
                        f22
                        f23
                        f24
                        f25
                        f26
                        f27
                        f28
                        f29
                        f30
                        f31
                        f32
                        f33
                        f34
                        f35
                        f36
                        f37
                        f38
                        f39
                        f40
                        f41
                        f42
                        f43

For a call stack deeper than the LBR limit, the hardware overwrites the oldest branches in the LBR registers, so only a partial call stack can be reconstructed.

However, the overwritten LBRs may still be retrievable from the previous sample, which was taken at a moment when the hardware had not yet overwritten those registers. Perf tools can stitch those overwritten LBRs onto the current call stack to obtain a more complete one.

To determine whether LBRs can be stitched, perf tools compare the current sample with the previous sample:

 - They must have identical LBR records (same from, to and flags values,
   and the same physical index of the LBR registers).
 - The search starts from the base-of-stack of the current sample.

Once perf decides to stitch the previous LBRs, the corresponding LBR cursor nodes are copied to 'lists', which tracks the nodes that are going to be stitched. When the stitching is over, the nodes are not freed immediately; they are moved to 'free_lists' so the next stitch can reuse the space. Both 'lists' and 'free_lists' are freed once all samples have been processed.

Committer notes:

Fix the intel-pt.c initialization of the union with 'struct branch_flags': its unnamed union breaks the build on older gcc versions.

Uninline thread__free_stitch_list(), as it grew big and started dragging includes into thread.h; move it to thread.c, where the headers it needs are already included. This fixes the build on several systems, such as debian:experimental when cross-building to the MIPS32 architecture; in the other cases, what was needed was being included by sheer luck:
  In file included from builtin-sched.c:11:
  util/thread.h: In function 'thread__free_stitch_list':
  util/thread.h:169:3: error: implicit declaration of function 'free' [-Werror=implicit-function-declaration]
    169 |   free(pos);
        |   ^~~~
  util/thread.h:169:3: error: incompatible implicit declaration of built-in function 'free' [-Werror]
  util/thread.h:19:1: note: include '<stdlib.h>' or provide a declaration of 'free'
     18 | #include "callchain.h"
    +++ |+#include <stdlib.h>
     19 |
  util/thread.h:174:3: error: incompatible implicit declaration of built-in function 'free' [-Werror]
    174 |   free(pos);
        |   ^~~~
  util/thread.h:174:3: note: include '<stdlib.h>' or provide a declaration of 'free'

Signed-off-by: Kan Liang <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Acked-by: Jiri Olsa <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexey Budankov <[email protected]>
Cc: Mathieu Poirier <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Pavel Gerasimov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ravi Bangoria <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Vitaly Slobodskoy <[email protected]>
Link: http://lore.kernel.org/lkml/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
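The stitch test described in the commit message boils down to ring-buffer index arithmetic. Below is a minimal, self-contained sketch of that comparison; the types and the function name (lbr_rec, lbr_sample, nr_identical_lbrs) are illustrative, not perf's. The real implementation is has_stitched_lbr() in the tools/perf/util/machine.c hunk further down.

	/*
	 * Sketch: find where the current sample's base-of-stack sits in the
	 * LBR register ring, then count records that the previous sample
	 * captured in the same physical registers with identical
	 * from/to/flags values.
	 */
	#include <stdint.h>

	struct lbr_rec { uint64_t from, to, flags; };

	struct lbr_sample {
		uint64_t hw_idx;         /* physical index of the newest entry */
		uint64_t nr;             /* number of captured entries */
		struct lbr_rec *entries; /* entries[0] is the newest branch */
	};

	static int nr_identical_lbrs(const struct lbr_sample *cur,
				     const struct lbr_sample *prev,
				     unsigned int max_lbr)
	{
		/* Physical index of the current sample's base-of-stack. */
		uint64_t cur_base = max_lbr - cur->nr + cur->hw_idx + 1;
		/* Ring distance from that base to the previous sample's top. */
		uint64_t distance = (prev->hw_idx > cur_base) ?
				    prev->hw_idx - cur_base :
				    max_lbr + prev->hw_idx - cur_base;
		int i, j, identical = 0;

		/* The previous stack is too shallow: nothing overlaps. */
		if (distance + 1 > prev->nr)
			return 0;

		/* Search from the current base-of-stack towards newer entries. */
		for (i = distance, j = cur->nr - 1; i >= 0 && j >= 0; i--, j--) {
			if (prev->entries[i].from  != cur->entries[j].from ||
			    prev->entries[i].to    != cur->entries[j].to   ||
			    prev->entries[i].flags != cur->entries[j].flags)
				break;
			identical++;
		}
		return identical;
	}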
1 parent 7f1d393 commit ff16562

File tree

6 files changed: 188 additions, 28 deletions

tools/perf/util/branch.h

Lines changed: 12 additions & 7 deletions
@@ -15,13 +15,18 @@
 #include "event.h"
 
 struct branch_flags {
-	u64 mispred:1;
-	u64 predicted:1;
-	u64 in_tx:1;
-	u64 abort:1;
-	u64 cycles:16;
-	u64 type:4;
-	u64 reserved:40;
+	union {
+		u64 value;
+		struct {
+			u64 mispred:1;
+			u64 predicted:1;
+			u64 in_tx:1;
+			u64 abort:1;
+			u64 cycles:16;
+			u64 type:4;
+			u64 reserved:40;
+		};
+	};
 };
 
 struct branch_info {
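The point of wrapping the bitfields in an anonymous union with a full-width value member is that a whole flags word can now be read, zeroed, or compared as a single u64; has_stitched_lbr() in machine.c below relies on this when matching LBR records. A minimal sketch of that kind of use (the helper name is hypothetical, not part of the patch):

	/* Compare two flag sets in one 64-bit operation rather than
	 * field by field. */
	static inline bool branch_flags_equal(const struct branch_flags *a,
					      const struct branch_flags *b)
	{
		return a->value == b->value;
	}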

tools/perf/util/callchain.h

Lines changed: 5 additions & 0 deletions
@@ -154,6 +154,11 @@ struct callchain_cursor_node {
 	struct callchain_cursor_node *next;
 };
 
+struct stitch_list {
+	struct list_head node;
+	struct callchain_cursor_node cursor;
+};
+
 struct callchain_cursor {
 	u64 nr;
 	struct callchain_cursor_node *first;
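A stitch_list is simply a resolved callchain_cursor_node plus a list_head, so a saved frame can be queued on lbr_stitch->lists. A sketch of filling one (hypothetical helper; the patch itself does this inline with memcpy() in has_stitched_lbr() below):

	#include <string.h>

	/* Copy a previously resolved frame into a stitch_list wrapper;
	 * the list_head is set up later by list_add()/list_add_tail()
	 * when the node is queued. Illustrative only. */
	static void stitch_node_fill(struct stitch_list *sn,
				     const struct callchain_cursor_node *saved)
	{
		memcpy(&sn->cursor, saved, sizeof(sn->cursor));
	}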

tools/perf/util/intel-pt.c

Lines changed: 8 additions & 9 deletions
@@ -1717,15 +1717,14 @@ static u64 intel_pt_lbr_flags(u64 info)
 	union {
 		struct branch_flags flags;
 		u64 result;
-	} u = {
-		.flags = {
-			.mispred = !!(info & LBR_INFO_MISPRED),
-			.predicted = !(info & LBR_INFO_MISPRED),
-			.in_tx = !!(info & LBR_INFO_IN_TX),
-			.abort = !!(info & LBR_INFO_ABORT),
-			.cycles = info & LBR_INFO_CYCLES,
-		}
-	};
+	} u;
+
+	u.result = 0;
+	u.flags.mispred = !!(info & LBR_INFO_MISPRED);
+	u.flags.predicted = !(info & LBR_INFO_MISPRED);
+	u.flags.in_tx = !!(info & LBR_INFO_IN_TX);
+	u.flags.abort = !!(info & LBR_INFO_ABORT);
+	u.flags.cycles = info & LBR_INFO_CYCLES;
 
 	return u.result;
 }
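This hunk is the committer fix mentioned above: with branch_flags' bitfields now living inside an unnamed union/struct, some older gcc versions reject a designated initializer that reaches into the anonymous members, so the union is zeroed and the bitfields are assigned one by one instead. A sketch of the portable pattern (the function name is hypothetical):

	#include <stdbool.h>
	#include <linux/types.h>
	#include "branch.h"

	/* Zero the whole union once, then assign the bitfields; plain
	 * assignments are accepted by compilers that choke on designated
	 * initializers of anonymous members. Sketch only. */
	static u64 lbr_flags_portable(bool mispred)
	{
		union {
			struct branch_flags flags;
			u64 result;
		} u;

		u.result = 0;		/* clears the reserved bits too */
		u.flags.mispred = mispred;
		u.flags.predicted = !mispred;

		return u.result;
	}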

tools/perf/util/machine.c

Lines changed: 138 additions & 1 deletion
@@ -2348,6 +2348,119 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread,
 	return 0;
 }
 
+static int lbr_callchain_add_stitched_lbr_ip(struct thread *thread,
+					     struct callchain_cursor *cursor)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct callchain_cursor_node *cnode;
+	struct stitch_list *stitch_node;
+	int err;
+
+	list_for_each_entry(stitch_node, &lbr_stitch->lists, node) {
+		cnode = &stitch_node->cursor;
+
+		err = callchain_cursor_append(cursor, cnode->ip,
+					      &cnode->ms,
+					      cnode->branch,
+					      &cnode->branch_flags,
+					      cnode->nr_loop_iter,
+					      cnode->iter_cycles,
+					      cnode->branch_from,
+					      cnode->srcline);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static struct stitch_list *get_stitch_node(struct thread *thread)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct stitch_list *stitch_node;
+
+	if (!list_empty(&lbr_stitch->free_lists)) {
+		stitch_node = list_first_entry(&lbr_stitch->free_lists,
+					       struct stitch_list, node);
+		list_del(&stitch_node->node);
+
+		return stitch_node;
+	}
+
+	return malloc(sizeof(struct stitch_list));
+}
+
+static bool has_stitched_lbr(struct thread *thread,
+			     struct perf_sample *cur,
+			     struct perf_sample *prev,
+			     unsigned int max_lbr,
+			     bool callee)
+{
+	struct branch_stack *cur_stack = cur->branch_stack;
+	struct branch_entry *cur_entries = perf_sample__branch_entries(cur);
+	struct branch_stack *prev_stack = prev->branch_stack;
+	struct branch_entry *prev_entries = perf_sample__branch_entries(prev);
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	int i, j, nr_identical_branches = 0;
+	struct stitch_list *stitch_node;
+	u64 cur_base, distance;
+
+	if (!cur_stack || !prev_stack)
+		return false;
+
+	/* Find the physical index of the base-of-stack for current sample. */
+	cur_base = max_lbr - cur_stack->nr + cur_stack->hw_idx + 1;
+
+	distance = (prev_stack->hw_idx > cur_base) ? (prev_stack->hw_idx - cur_base) :
+						     (max_lbr + prev_stack->hw_idx - cur_base);
+	/* Previous sample has shorter stack. Nothing can be stitched. */
+	if (distance + 1 > prev_stack->nr)
+		return false;
+
+	/*
+	 * Check if there are identical LBRs between two samples.
+	 * Identical LBRs must have same from, to and flags values. Also,
+	 * they have to be saved in the same LBR registers (same physical
+	 * index).
+	 *
+	 * Starts from the base-of-stack of current sample.
+	 */
+	for (i = distance, j = cur_stack->nr - 1; (i >= 0) && (j >= 0); i--, j--) {
+		if ((prev_entries[i].from != cur_entries[j].from) ||
+		    (prev_entries[i].to != cur_entries[j].to) ||
+		    (prev_entries[i].flags.value != cur_entries[j].flags.value))
+			break;
+		nr_identical_branches++;
+	}
+
+	if (!nr_identical_branches)
+		return false;
+
+	/*
+	 * Save the LBRs between the base-of-stack of previous sample
+	 * and the base-of-stack of current sample into lbr_stitch->lists.
+	 * These LBRs will be stitched later.
+	 */
+	for (i = prev_stack->nr - 1; i > (int)distance; i--) {
+		if (!lbr_stitch->prev_lbr_cursor[i].valid)
+			continue;
+
+		stitch_node = get_stitch_node(thread);
+		if (!stitch_node)
+			return false;
+
+		memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i],
+		       sizeof(struct callchain_cursor_node));
+
+		if (callee)
+			list_add(&stitch_node->node, &lbr_stitch->lists);
+		else
+			list_add_tail(&stitch_node->node, &lbr_stitch->lists);
+	}
+
+	return true;
+}
+
 static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr)
 {
 	if (thread->lbr_stitch)
@@ -2361,6 +2474,9 @@ static bool alloc_lbr_stitch(struct thread *thread, unsigned int max_lbr)
 	if (!thread->lbr_stitch->prev_lbr_cursor)
 		goto free_lbr_stitch;
 
+	INIT_LIST_HEAD(&thread->lbr_stitch->lists);
+	INIT_LIST_HEAD(&thread->lbr_stitch->free_lists);
+
 	return true;
 
 free_lbr_stitch:
@@ -2386,9 +2502,11 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 					int max_stack,
 					unsigned int max_lbr)
 {
+	bool callee = (callchain_param.order == ORDER_CALLEE);
 	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = min(max_stack, (int)chain->nr), i;
 	struct lbr_stitch *lbr_stitch;
+	bool stitched_lbr = false;
 	u64 branch_from = 0;
 	int err;
 
@@ -2405,10 +2523,18 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 	    (max_lbr > 0) && alloc_lbr_stitch(thread, max_lbr)) {
 		lbr_stitch = thread->lbr_stitch;
 
+		stitched_lbr = has_stitched_lbr(thread, sample,
+						&lbr_stitch->prev_sample,
+						max_lbr, callee);
+
+		if (!stitched_lbr && !list_empty(&lbr_stitch->lists)) {
+			list_replace_init(&lbr_stitch->lists,
+					  &lbr_stitch->free_lists);
+		}
 		memcpy(&lbr_stitch->prev_sample, sample, sizeof(*sample));
 	}
 
-	if (callchain_param.order == ORDER_CALLEE) {
+	if (callee) {
 		/* Add kernel ip */
 		err = lbr_callchain_add_kernel_ip(thread, cursor, sample,
 						  parent, root_al, branch_from,
@@ -2421,7 +2547,18 @@
 		if (err)
 			goto error;
 
+		if (stitched_lbr) {
+			err = lbr_callchain_add_stitched_lbr_ip(thread, cursor);
+			if (err)
+				goto error;
+		}
+
 	} else {
+		if (stitched_lbr) {
+			err = lbr_callchain_add_stitched_lbr_ip(thread, cursor);
+			if (err)
+				goto error;
+		}
 		err = lbr_callchain_add_lbr_ip(thread, cursor, sample, parent,
 					       root_al, &branch_from, false);
 		if (err)
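The lists/free_lists handling above is an O(1) recycling idiom: when a new sample fails to match, list_replace_init() parks the whole pending list on free_lists, and get_stitch_node() pops from there before falling back to malloc(). A standalone sketch of the idiom (node_get and nodes_park are hypothetical names):

	#include <linux/list.h>
	#include <stdlib.h>

	struct node { struct list_head link; };

	/* Reuse a parked node if one exists, otherwise allocate. */
	static struct node *node_get(struct list_head *free_list)
	{
		struct node *n;

		if (!list_empty(free_list)) {
			n = list_first_entry(free_list, struct node, link);
			list_del(&n->link);
			return n;
		}
		return malloc(sizeof(*n));
	}

	/* Hand the whole pending list over to free_list in O(1)
	 * (replacing free_list's previous contents, as the patch does)
	 * and reinitialize pending as empty for the next round. */
	static void nodes_park(struct list_head *pending,
			       struct list_head *free_list)
	{
		list_replace_init(pending, free_list);
	}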

tools/perf/util/thread.c

Lines changed: 22 additions & 0 deletions
@@ -454,3 +454,25 @@ int thread__memcpy(struct thread *thread, struct machine *machine,
 
 	return dso__data_read_offset(al.map->dso, machine, offset, buf, len);
 }
+
+void thread__free_stitch_list(struct thread *thread)
+{
+	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
+	struct stitch_list *pos, *tmp;
+
+	if (!lbr_stitch)
+		return;
+
+	list_for_each_entry_safe(pos, tmp, &lbr_stitch->lists, node) {
+		list_del_init(&pos->node);
+		free(pos);
+	}
+
+	list_for_each_entry_safe(pos, tmp, &lbr_stitch->free_lists, node) {
+		list_del_init(&pos->node);
+		free(pos);
+	}
+
+	zfree(&lbr_stitch->prev_lbr_cursor);
+	zfree(&thread->lbr_stitch);
+}

tools/perf/util/thread.h

Lines changed: 3 additions & 11 deletions
@@ -5,7 +5,6 @@
 #include <linux/refcount.h>
 #include <linux/rbtree.h>
 #include <linux/list.h>
-#include <linux/zalloc.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/types.h>
@@ -24,6 +23,8 @@ struct thread_stack;
 struct unwind_libunwind_ops;
 
 struct lbr_stitch {
+	struct list_head		lists;
+	struct list_head		free_lists;
 	struct perf_sample		prev_sample;
 	struct callchain_cursor_node	*prev_lbr_cursor;
 };
@@ -154,15 +155,6 @@ static inline bool thread__is_filtered(struct thread *thread)
 	return false;
 }
 
-static inline void thread__free_stitch_list(struct thread *thread)
-{
-	struct lbr_stitch *lbr_stitch = thread->lbr_stitch;
-
-	if (!lbr_stitch)
-		return;
-
-	zfree(&lbr_stitch->prev_lbr_cursor);
-	zfree(&thread->lbr_stitch);
-}
+void thread__free_stitch_list(struct thread *thread);
 
 #endif /* __PERF_THREAD_H */
