Commit 058fe1c

David Carrillo-Cisneros authored and Ingo Molnar committed
perf/core: Make cgroup switch visit only cpuctxs with cgroup events
This patch follows from a conversation in CQM/CMT's last series about
speeding up the context switch for cgroup events:

  https://patchwork.kernel.org/patch/9478617/

This is a low-hanging fruit optimization. It replaces the iteration over
the "pmus" list in cgroup switch by an iteration over a new list that
contains only cpuctxs with at least one cgroup event.

This is necessary because the number of PMUs has increased over the
years, e.g. modern x86 server systems have well above 50 PMUs. The
iteration over the full PMU list is unnecessary and can be costly in
heavy cache contention scenarios.

Below are some instrumentation measurements with 10, 50 and 90
percentiles of the total cost of context switch before and after this
optimization for a simple array read/write microbenchmark.

  Contention Level   Nr events   Before (us)           After (us)           Median
  L2       L3        types       (10%, 50%, 90%)       (10%, 50%, 90%)      Speedup
  ---------------------------------------------------------------------------------
  Low      Low       1           (1.72, 2.42, 5.85)    (1.35, 1.64, 5.46)   29%
  High     Low       1           (2.08, 4.56, 19.8)    (1720, 2.20, 13.7)   51%
  High     High      1           (2.86, 10.4, 12.7)    (2.54, 4.32, 12.1)   58%

  Low      Low       2           (1.98, 3.20, 6.89)    (1.68, 2.41, 8.89)   24%
  High     Low       2           (2.48, 5.28, 22.4)    (2150, 3.69, 14.6)   30%
  High     High      2           (3.32, 8.09, 13.9)    (2.80, 5.15, 13.7)   36%

where:

  1 event type  = cycles
  2 event types = cycles, intel_cqm/llc_occupancy/

  Contention L2   Low:  workset < L2 cache size.
                  High: workset >> L2 cache size.
  Contention L3   Low:  workset of task on all sockets < L3 cache size.
                  High: workset of task on all sockets >> L3 cache size.

  Median Speedup is (50%ile Before - 50%ile After) / 50%ile Before

Unsurprisingly, the benefits of this optimization decrease with the
number of cpuctxs with cgroup events, yet it is never detrimental.

Tested-by: Mark Rutland <[email protected]>
Signed-off-by: David Carrillo-Cisneros <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Mark Rutland <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Kan Liang <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Srinivas Pandruvada <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vikas Shivappa <[email protected]>
Cc: Vince Weaver <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent ae5112a commit 058fe1c
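The diffs below show the change only in kernel context. As a rough illustration of the pattern it applies -- put a context on a per-CPU list only while it has cgroup events, and let the switch path walk that short list instead of every PMU -- here is a minimal, userspace-only C sketch. All names (cpu_context, cgroup_event_added, cgroup_switch) and the hand-rolled list helpers are hypothetical stand-ins for the kernel's structures and list API; the kernel's exact add/remove bookkeeping in the patch differs, so treat this as a sketch of the idea, not the implementation.

/* Minimal userspace sketch of the per-CPU "contexts with cgroup events" list. */
#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = entry;
}

/* Stand-in for struct perf_cpu_context with the new cgrp_cpuctx_entry member. */
struct cpu_context {
	int cpu;
	int nr_cgroup_events;
	struct list_head cgrp_cpuctx_entry;
};

/* Stand-in for the new per-CPU cgrp_cpuctx_list. */
static struct list_head cgrp_cpuctx_list[NR_CPUS];

/* Link the context in when it gains its first cgroup event (sketch only). */
static void cgroup_event_added(struct cpu_context *ctx)
{
	if (ctx->nr_cgroup_events++ == 0)
		list_add(&ctx->cgrp_cpuctx_entry, &cgrp_cpuctx_list[ctx->cpu]);
}

/* Unlink it when the last cgroup event goes away (sketch only). */
static void cgroup_event_removed(struct cpu_context *ctx)
{
	if (--ctx->nr_cgroup_events == 0)
		list_del(&ctx->cgrp_cpuctx_entry);
}

/* The switch path visits only contexts that actually have cgroup events. */
static void cgroup_switch(int cpu)
{
	struct list_head *head = &cgrp_cpuctx_list[cpu], *pos;

	for (pos = head->next; pos != head; pos = pos->next) {
		struct cpu_context *ctx = (struct cpu_context *)
			((char *)pos - offsetof(struct cpu_context, cgrp_cpuctx_entry));
		printf("switching cgroup events: cpu%d, %d event(s)\n",
		       ctx->cpu, ctx->nr_cgroup_events);
	}
}

int main(void)
{
	struct cpu_context ctx = { .cpu = 0 };
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		INIT_LIST_HEAD(&cgrp_cpuctx_list[cpu]);

	cgroup_switch(0);		/* list empty: nothing visited */
	cgroup_event_added(&ctx);
	cgroup_switch(0);		/* visits exactly one context */
	cgroup_event_removed(&ctx);
	cgroup_switch(0);		/* empty again */
	return 0;
}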

File tree

2 files changed: +46, -53 lines

include/linux/perf_event.h

Lines changed: 1 addition & 0 deletions
@@ -788,6 +788,7 @@ struct perf_cpu_context {
 	struct pmu			*unique_pmu;
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+	struct list_head		cgrp_cpuctx_entry;
 #endif
 
 	struct list_head		sched_cb_entry;

kernel/events/core.c

Lines changed: 45 additions & 53 deletions
@@ -678,6 +678,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +692,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
 	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
+	struct list_head *list;
 	unsigned long flags;
 
 	/*
-	 * disable interrupts to avoid geting nr_cgroup
-	 * changes via __perf_event_disable(). Also
-	 * avoids preemption.
+	 * Disable interrupts and preemption to avoid this CPU's
+	 * cgrp_cpuctx_entry to change under us.
 	 */
 	local_irq_save(flags);
 
-	/*
-	 * we reschedule only in the presence of cgroup
-	 * constrained events.
-	 */
+	list = this_cpu_ptr(&cgrp_cpuctx_list);
+	list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			continue; /* ensure we process each cpuctx once */
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		/*
-		 * perf_cgroup_events says at least one
-		 * context on this CPU has cgroup events.
-		 *
-		 * ctx->nr_cgroups reports the number of cgroup
-		 * events for a context.
-		 */
-		if (cpuctx->ctx.nr_cgroups > 0) {
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-			perf_pmu_disable(cpuctx->ctx.pmu);
-
-			if (mode & PERF_CGROUP_SWOUT) {
-				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-				/*
-				 * must not be done before ctxswout due
-				 * to event_filter_match() in event_sched_out()
-				 */
-				cpuctx->cgrp = NULL;
-			}
+		if (mode & PERF_CGROUP_SWOUT) {
+			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+			/*
+			 * must not be done before ctxswout due
+			 * to event_filter_match() in event_sched_out()
+			 */
+			cpuctx->cgrp = NULL;
+		}
 
-			if (mode & PERF_CGROUP_SWIN) {
-				WARN_ON_ONCE(cpuctx->cgrp);
-				/*
-				 * set cgrp before ctxsw in to allow
-				 * event_filter_match() to not have to pass
-				 * task around
-				 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-				 * because cgorup events are only per-cpu
-				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-			}
-			perf_pmu_enable(cpuctx->ctx.pmu);
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		if (mode & PERF_CGROUP_SWIN) {
+			WARN_ON_ONCE(cpuctx->cgrp);
+			/*
+			 * set cgrp before ctxsw in to allow
+			 * event_filter_match() to not have to pass
+			 * task around
+			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
+			 * because cgorup events are only per-cpu
+			 */
+			cpuctx->cgrp = perf_cgroup_from_task(task,
+							     &cpuctx->ctx);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 		}
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
@@ -889,6 +876,7 @@ list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
 	struct perf_cpu_context *cpuctx;
+	struct list_head *cpuctx_entry;
 
 	if (!is_cgroup_event(event))
 		return;
@@ -902,15 +890,16 @@ list_update_cgroup_event(struct perf_event *event,
 	 * this will always be called from the right CPU.
 	 */
 	cpuctx = __get_cpu_context(ctx);
-
-	/*
-	 * cpuctx->cgrp is NULL until a cgroup event is sched in or
-	 * ctx->nr_cgroup == 0 .
-	 */
-	if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-		cpuctx->cgrp = event->cgrp;
-	else if (!add)
+	cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+	/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+	if (add) {
+		list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+		if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+			cpuctx->cgrp = event->cgrp;
+	} else {
+		list_del(cpuctx_entry);
 		cpuctx->cgrp = NULL;
+	}
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -10709,6 +10698,9 @@ static void __init perf_event_init_all_cpus(void)
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 
+#ifdef CONFIG_CGROUP_PERF
+		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
 		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
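For reference, the path optimized here is only exercised when cgroup-constrained events exist, which userspace creates via perf_event_open() with PERF_FLAG_PID_CGROUP, passing a file descriptor for the target cgroup directory as the pid argument. The sketch below shows that usage; the cgroup path /sys/fs/cgroup/perf_event/mygroup is an assumption (a cgroup v1 perf_event hierarchy) and must be adapted to the local system.

/* Open a cgroup-constrained cycles counter on CPU 0 and read it after ~1s. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int cgroup_fd, event_fd;

	/* Assumed cgroup path; adjust to an existing cgroup on your system. */
	cgroup_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgroup_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/*
	 * With PERF_FLAG_PID_CGROUP, the pid argument is interpreted as a
	 * cgroup file descriptor; cpu must be >= 0 for cgroup events.
	 */
	event_fd = syscall(SYS_perf_event_open, &attr, cgroup_fd, 0, -1,
			   PERF_FLAG_PID_CGROUP);
	if (event_fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);

	if (read(event_fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles in cgroup on cpu0 over ~1s: %lld\n", count);

	close(event_fd);
	close(cgroup_fd);
	return 0;
}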
