Skip to content

Commit 340b47f

Browse files
kliang2 authored and acmel committed
perf top: Implement multithreading for perf_event__synthesize_threads
The proc files, which are sorted in alphabetical order, are evenly assigned to several synthesize threads to be processed in parallel. For 'perf top', the number of threads is hard-coded to the number of online CPUs. A following patch will introduce an option to set it. For other perf tools, the number of threads is 1, because their process functions, e.g. process_synthesized_event, are not ready for multithreading. This patch series only supports multithreaded event synthesis for 'perf top'. For other tools, it can be done separately later. With multithreading applied, the total processing time gets up to a 1.56x speedup on Knights Mill for 'perf top'. For a specific single event, the processing time could increase because of lock contention, so proc_map_timeout may need to be increased; otherwise some proc maps will be truncated. Based on my test, increasing proc_map_timeout has a small impact on the total processing time: the total processing time still gets a 1.49x speedup on Knights Mill after increasing proc_map_timeout. The patch itself doesn't increase proc_map_timeout. There is no need to implement multithreading for per-task monitoring (perf_event__synthesize_thread_map), since it doesn't have a performance issue.
Committer testing:

  # getconf _NPROCESSORS_ONLN
  4
  # perf trace --no-inherit -e clone -o /tmp/output perf top
  # tail -4 /tmp/bla
     0.124 ( 0.041 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eb3a8f30, parent_tidptr: 0x7fc3eb3a99d0, child_tidptr: 0x7fc3eb3a99d0, tls: 0x7fc3eb3a9700) = 9548 (perf)
     0.246 ( 0.023 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eaba7f30, parent_tidptr: 0x7fc3eaba89d0, child_tidptr: 0x7fc3eaba89d0, tls: 0x7fc3eaba8700) = 9549 (perf)
     0.286 ( 0.019 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9550 (perf)
   246.540 ( 0.047 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9551 (perf)
  #

Signed-off-by: Kan Liang <[email protected]>
Tested-by: Arnaldo Carvalho de Melo <[email protected]>
Acked-by: Jiri Olsa <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Lukasz Odzioba <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Wang Nan <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent f988e71 commit 340b47f

File tree

9 files changed

+155
-42
lines changed

9 files changed

+155
-42
lines changed

tools/perf/builtin-kvm.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1441,7 +1441,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
14411441
perf_session__set_id_hdr_size(kvm->session);
14421442
ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true);
14431443
machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
1444-
kvm->evlist->threads, false, kvm->opts.proc_map_timeout);
1444+
kvm->evlist->threads, false,
1445+
kvm->opts.proc_map_timeout, 1);
14451446
err = kvm_live_open_events(kvm);
14461447
if (err)
14471448
goto out;

tools/perf/builtin-record.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -863,7 +863,7 @@ static int record__synthesize(struct record *rec, bool tail)
863863

864864
err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
865865
process_synthesized_event, opts->sample_address,
866-
opts->proc_map_timeout);
866+
opts->proc_map_timeout, 1);
867867
out:
868868
return err;
869869
}

tools/perf/builtin-top.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -958,8 +958,14 @@ static int __cmd_top(struct perf_top *top)
958958
if (perf_session__register_idle_thread(top->session) < 0)
959959
goto out_delete;
960960

961+
perf_set_multithreaded();
962+
961963
machine__synthesize_threads(&top->session->machines.host, &opts->target,
962-
top->evlist->threads, false, opts->proc_map_timeout);
964+
top->evlist->threads, false,
965+
opts->proc_map_timeout,
966+
(unsigned int)sysconf(_SC_NPROCESSORS_ONLN));
967+
968+
perf_set_singlethreaded();
963969

964970
if (perf_hpp_list.socket) {
965971
ret = perf_env__read_cpu_topology_map(&perf_env);

tools/perf/builtin-trace.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1131,7 +1131,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
11311131

11321132
err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
11331133
evlist->threads, trace__tool_process, false,
1134-
trace->opts.proc_map_timeout);
1134+
trace->opts.proc_map_timeout, 1);
11351135
if (err)
11361136
symbol__exit();
11371137

tools/perf/tests/mmap-thread-lookup.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ static int synth_all(struct machine *machine)
131131
{
132132
return perf_event__synthesize_threads(NULL,
133133
perf_event__process,
134-
machine, 0, 500);
134+
machine, 0, 500, 1);
135135
}
136136

137137
static int synth_process(struct machine *machine)

tools/perf/util/event.c

Lines changed: 129 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -678,23 +678,21 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
678678
return err;
679679
}
680680

681-
int perf_event__synthesize_threads(struct perf_tool *tool,
682-
perf_event__handler_t process,
683-
struct machine *machine,
684-
bool mmap_data,
685-
unsigned int proc_map_timeout)
681+
static int __perf_event__synthesize_threads(struct perf_tool *tool,
682+
perf_event__handler_t process,
683+
struct machine *machine,
684+
bool mmap_data,
685+
unsigned int proc_map_timeout,
686+
struct dirent **dirent,
687+
int start,
688+
int num)
686689
{
687690
union perf_event *comm_event, *mmap_event, *fork_event;
688691
union perf_event *namespaces_event;
689-
char proc_path[PATH_MAX];
690-
struct dirent **dirent;
691692
int err = -1;
692693
char *end;
693694
pid_t pid;
694-
int n, i;
695-
696-
if (machine__is_default_guest(machine))
697-
return 0;
695+
int i;
698696

699697
comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
700698
if (comm_event == NULL)
@@ -714,34 +712,25 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
714712
if (namespaces_event == NULL)
715713
goto out_free_fork;
716714

717-
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
718-
n = scandir(proc_path, &dirent, 0, alphasort);
719-
720-
if (n < 0)
721-
goto out_free_namespaces;
722-
723-
for (i = 0; i < n; i++) {
715+
for (i = start; i < start + num; i++) {
724716
if (!isdigit(dirent[i]->d_name[0]))
725717
continue;
726718

727719
pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
728720
/* only interested in proper numerical dirents */
729-
if (!*end) {
730-
/*
731-
* We may race with exiting thread, so don't stop just because
732-
* one thread couldn't be synthesized.
733-
*/
734-
__event__synthesize_thread(comm_event, mmap_event, fork_event,
735-
namespaces_event, pid, 1, process,
736-
tool, machine, mmap_data,
737-
proc_map_timeout);
738-
}
739-
free(dirent[i]);
721+
if (*end)
722+
continue;
723+
/*
724+
* We may race with exiting thread, so don't stop just because
725+
* one thread couldn't be synthesized.
726+
*/
727+
__event__synthesize_thread(comm_event, mmap_event, fork_event,
728+
namespaces_event, pid, 1, process,
729+
tool, machine, mmap_data,
730+
proc_map_timeout);
740731
}
741-
free(dirent);
742732
err = 0;
743733

744-
out_free_namespaces:
745734
free(namespaces_event);
746735
out_free_fork:
747736
free(fork_event);
@@ -753,6 +742,115 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
753742
return err;
754743
}
755744

745+
struct synthesize_threads_arg {
746+
struct perf_tool *tool;
747+
perf_event__handler_t process;
748+
struct machine *machine;
749+
bool mmap_data;
750+
unsigned int proc_map_timeout;
751+
struct dirent **dirent;
752+
int num;
753+
int start;
754+
};
755+
756+
static void *synthesize_threads_worker(void *arg)
757+
{
758+
struct synthesize_threads_arg *args = arg;
759+
760+
__perf_event__synthesize_threads(args->tool, args->process,
761+
args->machine, args->mmap_data,
762+
args->proc_map_timeout, args->dirent,
763+
args->start, args->num);
764+
return NULL;
765+
}
766+
767+
int perf_event__synthesize_threads(struct perf_tool *tool,
768+
perf_event__handler_t process,
769+
struct machine *machine,
770+
bool mmap_data,
771+
unsigned int proc_map_timeout,
772+
unsigned int nr_threads_synthesize)
773+
{
774+
struct synthesize_threads_arg *args = NULL;
775+
pthread_t *synthesize_threads = NULL;
776+
char proc_path[PATH_MAX];
777+
struct dirent **dirent;
778+
int num_per_thread;
779+
int m, n, i, j;
780+
int thread_nr;
781+
int base = 0;
782+
int err = -1;
783+
784+
785+
if (machine__is_default_guest(machine))
786+
return 0;
787+
788+
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
789+
n = scandir(proc_path, &dirent, 0, alphasort);
790+
if (n < 0)
791+
return err;
792+
793+
thread_nr = nr_threads_synthesize;
794+
795+
if (thread_nr <= 1) {
796+
err = __perf_event__synthesize_threads(tool, process,
797+
machine, mmap_data,
798+
proc_map_timeout,
799+
dirent, base, n);
800+
goto free_dirent;
801+
}
802+
if (thread_nr > n)
803+
thread_nr = n;
804+
805+
synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
806+
if (synthesize_threads == NULL)
807+
goto free_dirent;
808+
809+
args = calloc(sizeof(*args), thread_nr);
810+
if (args == NULL)
811+
goto free_threads;
812+
813+
num_per_thread = n / thread_nr;
814+
m = n % thread_nr;
815+
for (i = 0; i < thread_nr; i++) {
816+
args[i].tool = tool;
817+
args[i].process = process;
818+
args[i].machine = machine;
819+
args[i].mmap_data = mmap_data;
820+
args[i].proc_map_timeout = proc_map_timeout;
821+
args[i].dirent = dirent;
822+
}
823+
for (i = 0; i < m; i++) {
824+
args[i].num = num_per_thread + 1;
825+
args[i].start = i * args[i].num;
826+
}
827+
if (i != 0)
828+
base = args[i-1].start + args[i-1].num;
829+
for (j = i; j < thread_nr; j++) {
830+
args[j].num = num_per_thread;
831+
args[j].start = base + (j - i) * args[i].num;
832+
}
833+
834+
for (i = 0; i < thread_nr; i++) {
835+
if (pthread_create(&synthesize_threads[i], NULL,
836+
synthesize_threads_worker, &args[i]))
837+
goto out_join;
838+
}
839+
err = 0;
840+
out_join:
841+
for (i = 0; i < thread_nr; i++)
842+
pthread_join(synthesize_threads[i], NULL);
843+
free(args);
844+
free_threads:
845+
free(synthesize_threads);
846+
free_dirent:
847+
for (i = 0; i < n; i++)
848+
free(dirent[i]);
849+
free(dirent);
850+
851+
return err;
852+
}
853+
756854
struct process_symbol_args {
757855
const char *name;
758856
u64 start;

tools/perf/util/event.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,8 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool,
680680
int perf_event__synthesize_threads(struct perf_tool *tool,
681681
perf_event__handler_t process,
682682
struct machine *machine, bool mmap_data,
683-
unsigned int proc_map_timeout);
683+
unsigned int proc_map_timeout,
684+
unsigned int nr_threads_synthesize);
684685
int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
685686
perf_event__handler_t process,
686687
struct machine *machine);

tools/perf/util/machine.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2218,12 +2218,16 @@ int machines__for_each_thread(struct machines *machines,
22182218
int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
22192219
struct target *target, struct thread_map *threads,
22202220
perf_event__handler_t process, bool data_mmap,
2221-
unsigned int proc_map_timeout)
2221+
unsigned int proc_map_timeout,
2222+
unsigned int nr_threads_synthesize)
22222223
{
22232224
if (target__has_task(target))
22242225
return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout);
22252226
else if (target__has_cpu(target))
2226-
return perf_event__synthesize_threads(tool, process, machine, data_mmap, proc_map_timeout);
2227+
return perf_event__synthesize_threads(tool, process,
2228+
machine, data_mmap,
2229+
proc_map_timeout,
2230+
nr_threads_synthesize);
22272231
/* command specified */
22282232
return 0;
22292233
}

tools/perf/util/machine.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,15 +257,18 @@ int machines__for_each_thread(struct machines *machines,
257257
int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
258258
struct target *target, struct thread_map *threads,
259259
perf_event__handler_t process, bool data_mmap,
260-
unsigned int proc_map_timeout);
260+
unsigned int proc_map_timeout,
261+
unsigned int nr_threads_synthesize);
261262
static inline
262263
int machine__synthesize_threads(struct machine *machine, struct target *target,
263264
struct thread_map *threads, bool data_mmap,
264-
unsigned int proc_map_timeout)
265+
unsigned int proc_map_timeout,
266+
unsigned int nr_threads_synthesize)
265267
{
266268
return __machine__synthesize_threads(machine, NULL, target, threads,
267269
perf_event__process, data_mmap,
268-
proc_map_timeout);
270+
proc_map_timeout,
271+
nr_threads_synthesize);
269272
}
270273

271274
pid_t machine__get_current_tid(struct machine *machine, int cpu);

0 commit comments

Comments
 (0)