
Commit d153b15

Peter Zijlstra authored and Ingo Molnar committed
sched/core: Fix wake_affine() performance regression
Eric reported a sysbench regression against commit:

  3fed382 ("sched/numa: Implement NUMA node level wake_affine()")

Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed against
his v3.10 enterprise kernel.

PRE (current tip/master):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64110  (2136.94 per sec.)
   5: [30 secs]  transactions: 143644 (4787.99 per sec.)
  10: [30 secs]  transactions: 274298 (9142.93 per sec.)
  20: [30 secs]  transactions: 418683 (13955.45 per sec.)
  40: [30 secs]  transactions: 320731 (10690.15 per sec.)
  80: [30 secs]  transactions: 355096 (11834.28 per sec.)

 hsw-ex NAS:

 OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01
 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89
 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93
 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68
 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36
 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83

POST (+patch):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64494  (2149.75 per sec.)
   5: [30 secs]  transactions: 145114 (4836.99 per sec.)
  10: [30 secs]  transactions: 278311 (9276.69 per sec.)
  20: [30 secs]  transactions: 437169 (14571.60 per sec.)
  40: [30 secs]  transactions: 669837 (22326.73 per sec.)
  80: [30 secs]  transactions: 631739 (21055.88 per sec.)

 hsw-ex NAS:

 lu.C.x_threads_144_run_1.log: Time in seconds = 23.36
 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96
 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52

This patch takes out all the shiny wake_affine() stuff and goes back to
utter basics. Between the two CPUs involved with the wakeup (the CPU
doing the wakeup and the CPU we ran on previously) pick the CPU we can
run on _now_.

This restores much of the regressions against the older kernels, but
leaves some ground in the overloaded case. The default-enabled WA_WEIGHT
(which will be introduced in the next patch) is an attempt to address
the overloaded situation.

Reported-by: Eric Farman <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Matthew Rosato <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Fixes: 3fed382 ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Ingo Molnar <[email protected]>
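To make the "pick the CPU we can run on now" rule concrete outside the kernel tree, the sketch below is a small user-space model of the new wake_affine_idle() decision added in the fair.c hunk. The helpers cpu_is_idle() and runnable_count() are hypothetical stand-ins for the kernel's idle_cpu() and cpu_rq(cpu)->nr_running; this is an illustration of the heuristic, not the in-tree implementation.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical stand-ins for the kernel's idle_cpu() and
 * cpu_rq(cpu)->nr_running, here driven by test data.
 */
static bool cpu_is_idle(int cpu, const int *nr_running)
{
        return nr_running[cpu] == 0;
}

static int runnable_count(int cpu, const int *nr_running)
{
        return nr_running[cpu];
}

/*
 * Model of wake_affine_idle(): prefer the waking CPU only when the task
 * could run there right now -- the CPU is idle, or this is a sync wakeup
 * and the waker is the only runnable task (and is about to sleep).
 */
static bool wake_to_this_cpu(int this_cpu, int sync, const int *nr_running)
{
        if (cpu_is_idle(this_cpu, nr_running))
                return true;

        if (sync && runnable_count(this_cpu, nr_running) == 1)
                return true;

        return false;
}

int main(void)
{
        int nr_running[2] = { 1, 3 };   /* CPU0: only the waker; CPU1: busy */

        /* sync wakeup from CPU0: the waker will sleep, so wake there */
        printf("sync wakeup on cpu0  -> %s\n",
               wake_to_this_cpu(0, 1, nr_running) ? "this_cpu" : "prev_cpu");

        /* non-sync wakeup from busy CPU1: stay on the previous CPU */
        printf("async wakeup on cpu1 -> %s\n",
               wake_to_this_cpu(1, 0, nr_running) ? "this_cpu" : "prev_cpu");

        return 0;
}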
1 parent: 529a86e · commit: d153b15

File tree

3 files changed: +16 −119 lines


include/linux/sched/topology.h

Lines changed: 0 additions & 8 deletions
@@ -71,14 +71,6 @@ struct sched_domain_shared {
         atomic_t        ref;
         atomic_t        nr_busy_cpus;
         int             has_idle_cores;
-
-        /*
-         * Some variables from the most recent sd_lb_stats for this domain,
-         * used by wake_affine().
-         */
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
 };
 
 struct sched_domain {

kernel/sched/fair.c

Lines changed: 15 additions & 111 deletions
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
         return 1;
 }
 
-struct llc_stats {
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
-        int             has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-        struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-        if (!sds)
-                return false;
-
-        stats->nr_running   = READ_ONCE(sds->nr_running);
-        stats->load         = READ_ONCE(sds->load);
-        stats->capacity     = READ_ONCE(sds->capacity);
-        stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-        return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-                int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int prev_cpu, int sync)
 {
-        struct llc_stats prev_stats, this_stats;
-        s64 this_eff_load, prev_eff_load;
-        unsigned long task_load;
-
-        if (!get_llc_stats(&prev_stats, prev_cpu) ||
-            !get_llc_stats(&this_stats, this_cpu))
-                return false;
-
-        /*
-         * If sync wakeup then subtract the (maximum possible)
-         * effect of the currently running task from the load
-         * of the current LLC.
-         */
-        if (sync) {
-                unsigned long current_load = task_h_load(current);
-
-                /* in this case load hits 0 and this LLC is considered 'idle' */
-                if (current_load > this_stats.load)
-                        return true;
-
-                this_stats.load -= current_load;
-        }
-
-        /*
-         * The has_capacity stuff is not SMT aware, but by trying to balance
-         * the nr_running on both ends we try and fill the domain at equal
-         * rates, thereby first consuming cores before siblings.
-         */
-
-        /* if the old cache has capacity, stay there */
-        if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-                return false;
-
-        /* if this cache has capacity, come here */
-        if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+        if (idle_cpu(this_cpu))
                 return true;
 
-        /*
-         * Check to see if we can move the load without causing too much
-         * imbalance.
-         */
-        task_load = task_h_load(p);
-
-        this_eff_load = 100;
-        this_eff_load *= prev_stats.capacity;
-
-        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-        prev_eff_load *= this_stats.capacity;
-
-        this_eff_load *= this_stats.load + task_load;
-        prev_eff_load *= prev_stats.load - task_load;
+        if (sync && cpu_rq(this_cpu)->nr_running == 1)
+                return true;
 
-        return this_eff_load <= prev_eff_load;
+        return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int prev_cpu, int sync)
 {
         int this_cpu = smp_processor_id();
-        bool affine;
-
-        /*
-         * Default to no affine wakeups; wake_affine() should not effect a task
-         * placement the load-balancer feels inclined to undo. The conservative
-         * option is therefore to not move tasks when they wake up.
-         */
-        affine = false;
+        bool affine = false;
 
-        /*
-         * If the wakeup is across cache domains, try to evaluate if movement
-         * makes sense, otherwise rely on select_idle_siblings() to do
-         * placement inside the cache domain.
-         */
-        if (!cpus_share_cache(prev_cpu, this_cpu))
-                affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+        if (sched_feat(WA_IDLE) && !affine)
+                affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
         if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-        struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
         }
-
-        if (!shared)
-                return;
-
-        /*
-         * Since these are sums over groups they can contain some CPUs
-         * multiple times for the NUMA domains.
-         *
-         * Currently only wake_affine_llc() and find_busiest_group()
-         * uses these numbers, only the last is affected by this problem.
-         *
-         * XXX fix that.
-         */
-        WRITE_ONCE(shared->nr_running, sds->total_running);
-        WRITE_ONCE(shared->load,       sds->total_load);
-        WRITE_ONCE(shared->capacity,   sds->total_capacity);
 }
 
 /**

kernel/sched/features.h

Lines changed: 1 addition & 0 deletions
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
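For context on what that one-line addition buys: a SCHED_FEAT() entry becomes a named scheduler feature bit that sched_feat() tests at the call site shown in the fair.c hunk above (and, on kernels built with CONFIG_SCHED_DEBUG, can be flipped at runtime via /sys/kernel/debug/sched_features). The snippet below is a deliberately simplified user-space model of that pattern, not the kernel's actual feature-bit machinery.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model: each SCHED_FEAT(name, default) becomes one bit. */
enum { FEAT_WA_IDLE, FEAT_NR };

static bool feat_enabled[FEAT_NR] = {
        [FEAT_WA_IDLE] = true,          /* models SCHED_FEAT(WA_IDLE, true) */
};

/* Stand-in for the kernel's sched_feat() test. */
static bool sched_feat(int f)
{
        return feat_enabled[f];
}

int main(void)
{
        /*
         * Mirrors the gate added to wake_affine(): the new idle-based
         * heuristic is only consulted while the feature bit is on.
         */
        bool affine = false;

        if (sched_feat(FEAT_WA_IDLE) && !affine)
                printf("would call wake_affine_idle()\n");

        return 0;
}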
