
Commit 10e2f1a

Peter Zijlstra authored and Ingo Molnar committed
sched/core: Rewrite and improve select_idle_siblings()
select_idle_siblings() is a known pain point for a number of workloads; it either does too much or not enough, and sometimes gets it plain wrong. This rewrite attempts to address a number of issues (but sadly not all).

The current code does an unconditional sched_domain iteration, with the intent of finding an idle core (on SMT hardware). The problems this patch tries to address are:

 - It's pointless to look for idle cores if the machine is really busy; at that point you're just wasting cycles.

 - Its behaviour is inconsistent between SMT and !SMT hardware: !SMT hardware ends up scanning for any idle CPU in the LLC domain, while SMT hardware scans for idle cores and, if that fails, falls back to a scan for idle threads on the 'target' core.

The new code replaces the sched_domain scan with 3 explicit scans:

 1) search for an idle core in the LLC
 2) search for an idle CPU in the LLC
 3) search for an idle thread in the 'target' core

where 1 and 3 are conditional on SMT support, and 1 and 2 have runtime heuristics to skip the step.

Step 1) is conditional on sd_llc_shared->has_idle_cores; when a CPU goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT siblings of the CPU going idle. Similarly, we clear sd_llc_shared->has_idle_cores when we fail to find an idle core.

Step 2) tracks the average cost of the scan and compares this against the average idle-time guesstimate for the CPU doing the wakeup. There is a significant fudge factor involved to deal with the variability of the averages; hackbench in particular was sensitive to this.

Step 3) is unconditional; we assume (also per step 1) that scanning all SMT siblings of a core is 'cheap'.

With this, SMT systems gain step 2, which cures a few benchmarks -- notably one from Facebook.

One 'feature' of the sched_domain iteration, which we preserve in the new code, is that it starts scanning from the 'target' CPU instead of scanning the cpumask in CPU-id order. This reduces the chance that multiple CPUs in the LLC, all scanning for an idle CPU, gang up and pick the same one. The downside is that tasks can end up hopping across the LLC for no apparent reason.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
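To make the step 2 throttle described above concrete, here is a small standalone sketch of the avg_idle / 512 comparison that select_idle_cpu() performs (see the kernel/sched/fair.c hunk below); the idle-time and scan-cost values in main() are made-up illustrative numbers, not measurements from the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustration of the step 2 throttle in select_idle_cpu(): the LLC-wide
 * scan is skipped when avg_idle / 512 < avg_scan_cost. The kernel reads
 * these from rq->avg_idle and sd->avg_scan_cost, both in nanoseconds;
 * the values used below are arbitrary examples.
 */
static bool scan_allowed(uint64_t avg_idle_ns, uint64_t avg_scan_cost_ns)
{
	return (avg_idle_ns / 512) >= avg_scan_cost_ns;
}

int main(void)
{
	/* Fairly busy CPU: 200 us average idle, 1 us average scan cost. */
	printf("busy: %d\n", scan_allowed(200000, 1000));   /* 390 < 1000   -> 0, skip scan */
	/* Mostly idle CPU: 1 ms average idle, same scan cost. */
	printf("idle: %d\n", scan_allowed(1000000, 1000));  /* 1953 >= 1000 -> 1, scan LLC  */
	return 0;
}

The 1/512 ratio is the "significant fudge factor" mentioned above; the measured scan time is then folded back into sd->avg_scan_cost with a 1/8 weight, as the fair.c hunk below shows.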
1 parent 0e369d7 commit 10e2f1a

5 files changed: +234 −47 lines

include/linux/sched.h

Lines changed: 3 additions & 0 deletions
@@ -1070,6 +1070,7 @@ struct sched_group;
 struct sched_domain_shared {
 	atomic_t ref;
 	atomic_t nr_busy_cpus;
+	int has_idle_cores;
 };
 
 struct sched_domain {
@@ -1102,6 +1103,8 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long next_decay_max_lb_cost;
 
+	u64 avg_scan_cost;	/* select_idle_sibling */
+
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];

kernel/sched/core.c

Lines changed: 3 additions & 0 deletions
@@ -7478,6 +7478,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
 void __init sched_init(void)
 {
@@ -7514,6 +7515,8 @@ void __init sched_init(void)
 	for_each_possible_cpu(i) {
 		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 

kernel/sched/fair.c

Lines changed: 221 additions & 46 deletions
@@ -1582,9 +1582,16 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		/*
+		 * select_idle_siblings() uses an per-cpu cpumask that
+		 * can be used from IRQ context.
+		 */
+		local_irq_disable();
 		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 						   env->dst_cpu);
+		local_irq_enable();
+	}
 
 assign:
 	task_numa_assign(env, cur, imp);
@@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * Especially tbench is found sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+	int next;
+
+again:
+	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+	if (*wrapped) {
+		if (next >= start)
+			return nr_cpumask_bits;
+	} else {
+		if (next >= nr_cpumask_bits) {
+			*wrapped = 1;
+			n = -1;
+			goto again;
+		}
+	}
+
+	return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
+	for ((wrap) = 0, (cpu) = (start)-1;					\
+		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\
+		(cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+	struct sched_domain_shared *sds;
+
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds)
+		WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+	struct sched_domain_shared *sds;
+
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds)
+		return READ_ONCE(sds->has_idle_cores);
+
+	return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void update_idle_core(struct rq *rq)
+{
+	int core = cpu_of(rq);
+	int cpu;
+
+	rcu_read_lock();
+	if (test_idle_cores(core, true))
+		goto unlock;
+
+	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu == core)
+			continue;
+
+		if (!idle_cpu(cpu))
+			goto unlock;
+	}
+
+	set_idle_cores(core, 1);
+unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int core, cpu, wrap;
+
+	if (!test_idle_cores(target, false))
+		return -1;
+
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+	for_each_cpu_wrap(core, cpus, target, wrap) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
+			cpumask_clear_cpu(cpu, cpus);
+			if (!idle_cpu(cpu))
+				idle = false;
+		}
+
+		if (idle)
+			return core;
+	}
+
+	/*
+	 * Failed to find an idle core; stop looking for one.
+	 */
+	set_idle_cores(target, 0);
+
+	return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpu_smt_mask(target)) {
+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(cpu))
+			return cpu;
+	}
+
+	return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+	u64 avg_idle = this_rq()->avg_idle;
+	u64 avg_cost = this_sd->avg_scan_cost;
+	u64 time, cost;
+	s64 delta;
+	int cpu, wrap;
+
+	/*
+	 * Due to large variance we need a large fuzz factor; hackbench in
+	 * particularly is sensitive here.
+	 */
+	if ((avg_idle / 512) < avg_cost)
+		return -1;
+
+	time = local_clock();
+
+	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(cpu))
+			break;
+	}
+
+	time = local_clock() - time;
+	cost = this_sd->avg_scan_cost;
+	delta = (s64)(time - cost) / 8;
+	this_sd->avg_scan_cost += delta;
+
+	return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
+	int i;
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
 		return prev;
 
-	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
-	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			int i;
+	if (!sd)
+		return target;
 
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
+	i = select_idle_core(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
 
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
-					goto next;
-			}
+	i = select_idle_cpu(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
+
+	i = select_idle_smt(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
 
-			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
-			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
-	}
-done:
 	return target;
 }
 
@@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL 512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
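
A side note on the for_each_cpu_wrap() iterator introduced in the hunk above: it visits the CPUs in the mask starting at @start, wraps around to CPU 0, and stops before revisiting @start. A minimal userspace sketch of that visiting order, assuming a fully populated 8-CPU mask and a made-up start CPU of 5 (the real macro also skips CPUs that are not set in the mask):

#include <stdio.h>

/*
 * Illustration only: prints the order in which for_each_cpu_wrap() would
 * visit CPUs 0..7 when started at CPU 5, i.e. 5 6 7 0 1 2 3 4.
 */
int main(void)
{
	const int nr_cpus = 8;
	const int start = 5;

	for (int i = 0; i < nr_cpus; i++)
		printf("%d ", (start + i) % nr_cpus);
	printf("\n");
	return 0;
}

Starting the scan at the waking 'target' CPU rather than at CPU 0 is what keeps concurrent wakeups within one LLC from all converging on the same idle CPU, as noted in the commit message.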

kernel/sched/idle_task.c

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	put_prev_task(rq, prev);
-
+	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
 	return rq->idle;
 }

kernel/sched/sched.h

Lines changed: 6 additions & 0 deletions
@@ -36,6 +36,12 @@ extern void cpu_load_update_active(struct rq *this_rq);
 static inline void cpu_load_update_active(struct rq *this_rq) { }
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+extern void update_idle_core(struct rq *rq);
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
