Commit 058fe1c

David Carrillo-Cisneros authored and Ingo Molnar committed
perf/core: Make cgroup switch visit only cpuctxs with cgroup events
This patch follows from a conversation in CQM/CMT's last series about
speeding up the context switch for cgroup events:

  https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

This is necessary because the number of PMUs has increased over the
years, e.g. modern x86 server systems have well above 50 PMUs. The
iteration over the full PMU list is unnecessary and can be costly in
heavy cache contention scenarios.

Below are some instrumentation measurements with 10, 50 and 90
percentiles of the total cost of context switch before and after this
optimization for a simple array read/write microbenchmark.

  Contention Level   Nr events   Before (us)           After (us)           Median
  L2       L3        types       (10%, 50%, 90%)       (10%, 50%, 90%)      Speedup
  ---------------------------------------------------------------------------------
  Low      Low       1           (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)   29%
  High     Low       1           (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)   51%
  High     High      1           (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)   58%

  Low      Low       2           (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)   24%
  High     Low       2           (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)   30%
  High     High      2           (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)   36%

where:

  1 event type  = cycles
  2 event types = cycles, intel_cqm/llc_occupancy/

  Contention L2   Low:  workset < L2 cache size.
                  High: workset >> L2 cache size.
  Contention L3   Low:  workset of task on all sockets < L3 cache size.
                  High: workset of task on all sockets >> L3 cache size.

  Median Speedup is (50%ile Before - 50%ile After) / 50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the
number of cpuctxs with cgroup events, yet it is never detrimental.

Tested-by: Mark Rutland <[email protected]>
Signed-off-by: David Carrillo-Cisneros <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Kan Liang <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Srinivas Pandruvada <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vikas Shivappa <[email protected]>
Cc: Vince Weaver <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent ae5112a commit 058fe1c
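The diffs below show the change only in kernel context. As a rough illustration of the pattern it applies -- put a context on a per-CPU list only while it has cgroup events, and let the switch path walk that short list instead of every PMU -- here is a minimal, userspace-only C sketch. All names (cpu_context, cgroup_event_added, cgroup_switch) and the hand-rolled list helpers are hypothetical stand-ins for the kernel's structures and list API; the kernel's exact add/remove bookkeeping in the patch differs, so treat this as a sketch of the idea, not the implementation.

/* Minimal userspace sketch of the per-CPU "contexts with cgroup events" list. */
#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = entry;
}

/* Stand-in for struct perf_cpu_context with the new cgrp_cpuctx_entry member. */
struct cpu_context {
	int cpu;
	int nr_cgroup_events;
	struct list_head cgrp_cpuctx_entry;
};

/* Stand-in for the new per-CPU cgrp_cpuctx_list. */
static struct list_head cgrp_cpuctx_list[NR_CPUS];

/* Link the context in when it gains its first cgroup event (sketch only). */
static void cgroup_event_added(struct cpu_context *ctx)
{
	if (ctx->nr_cgroup_events++ == 0)
		list_add(&ctx->cgrp_cpuctx_entry, &cgrp_cpuctx_list[ctx->cpu]);
}

/* Unlink it when the last cgroup event goes away (sketch only). */
static void cgroup_event_removed(struct cpu_context *ctx)
{
	if (--ctx->nr_cgroup_events == 0)
		list_del(&ctx->cgrp_cpuctx_entry);
}

/* The switch path visits only contexts that actually have cgroup events. */
static void cgroup_switch(int cpu)
{
	struct list_head *head = &cgrp_cpuctx_list[cpu], *pos;

	for (pos = head->next; pos != head; pos = pos->next) {
		struct cpu_context *ctx = (struct cpu_context *)
			((char *)pos - offsetof(struct cpu_context, cgrp_cpuctx_entry));
		printf("switching cgroup events: cpu%d, %d event(s)\n",
		       ctx->cpu, ctx->nr_cgroup_events);
	}
}

int main(void)
{
	struct cpu_context ctx = { .cpu = 0 };
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		INIT_LIST_HEAD(&cgrp_cpuctx_list[cpu]);

	cgroup_switch(0);		/* list empty: nothing visited */
	cgroup_event_added(&ctx);
	cgroup_switch(0);		/* visits exactly one context */
	cgroup_event_removed(&ctx);
	cgroup_switch(0);		/* empty again */
	return 0;
}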

File tree

2 files changed: +46, -53 lines

include/linux/perf_event.h

Lines changed: 1 addition & 0 deletions
@@ -788,6 +788,7 @@ struct perf_cpu_context {
 	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;

kernel/events/core.c

Lines changed: 45 additions & 53 deletions
@@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
+		}
 
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(cpuctx_entry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
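For reference, the path optimized here is only exercised when cgroup-constrained events exist, which userspace creates via perf_event_open() with PERF_FLAG_PID_CGROUP, passing a file descriptor for the target cgroup directory as the pid argument. The sketch below shows that usage; the cgroup path /sys/fs/cgroup/perf_event/mygroup is an assumption (a cgroup v1 perf_event hierarchy) and must be adapted to the local system.

/* Open a cgroup-constrained cycles counter on CPU 0 and read it after ~1s. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int cgroup_fd, event_fd;

	/* Assumed cgroup path; adjust to an existing cgroup on your system. */
	cgroup_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgroup_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/*
	 * With PERF_FLAG_PID_CGROUP, the pid argument is interpreted as a
	 * cgroup file descriptor; cpu must be >= 0 for cgroup events.
	 */
	event_fd = syscall(SYS_perf_event_open, &attr, cgroup_fd, 0, -1,
			   PERF_FLAG_PID_CGROUP);
	if (event_fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);

	if (read(event_fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles in cgroup on cpu0 over ~1s: %lld\n", count);

	close(event_fd);
	close(cgroup_fd);
	return 0;
}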
