Skip to content

Commit a339b35

Browse files
committed
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Three fixes that address an SMP balancing performance regression."

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Ensure load_balance() respects the active_mask
  sched/core: Address more wake_affine() regressions
  sched/core: Fix wake_affine() performance regression
2 parents 7b764ce + 024c9d2 commit a339b35

File tree

3 files changed

+49
-102
lines changed

3 files changed

+49
-102
lines changed

include/linux/sched/topology.h

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -71,14 +71,6 @@ struct sched_domain_shared {
7171
atomic_t ref;
7272
atomic_t nr_busy_cpus;
7373
int has_idle_cores;
74-
75-
/*
76-
* Some variables from the most recent sd_lb_stats for this domain,
77-
* used by wake_affine().
78-
*/
79-
unsigned long nr_running;
80-
unsigned long load;
81-
unsigned long capacity;
8274
};
8375

8476
struct sched_domain {

kernel/sched/fair.c

Lines changed: 46 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
53565356
return 1;
53575357
}
53585358

5359-
struct llc_stats {
5360-
unsigned long nr_running;
5361-
unsigned long load;
5362-
unsigned long capacity;
5363-
int has_capacity;
5364-
};
5359+
/*
5360+
* The purpose of wake_affine() is to quickly determine on which CPU we can run
5361+
* soonest. For the purpose of speed we only consider the waking and previous
5362+
* CPU.
5363+
*
5364+
* wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
5365+
* will be) idle.
5366+
*
5367+
* wake_affine_weight() - considers the weight to reflect the average
5368+
* scheduling latency of the CPUs. This seems to work
5369+
* for the overloaded case.
5370+
*/
53655371

5366-
static bool get_llc_stats(struct llc_stats *stats, int cpu)
5372+
static bool
5373+
wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
5374+
int this_cpu, int prev_cpu, int sync)
53675375
{
5368-
struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5369-
5370-
if (!sds)
5371-
return false;
5376+
if (idle_cpu(this_cpu))
5377+
return true;
53725378

5373-
stats->nr_running = READ_ONCE(sds->nr_running);
5374-
stats->load = READ_ONCE(sds->load);
5375-
stats->capacity = READ_ONCE(sds->capacity);
5376-
stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
5379+
if (sync && cpu_rq(this_cpu)->nr_running == 1)
5380+
return true;
53775381

5378-
return true;
5382+
return false;
53795383
}
53805384

5381-
/*
5382-
* Can a task be moved from prev_cpu to this_cpu without causing a load
5383-
* imbalance that would trigger the load balancer?
5384-
*
5385-
* Since we're running on 'stale' values, we might in fact create an imbalance
5386-
* but recomputing these values is expensive, as that'd mean iteration 2 cache
5387-
* domains worth of CPUs.
5388-
*/
53895385
static bool
5390-
wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
5391-
int this_cpu, int prev_cpu, int sync)
5386+
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5387+
int this_cpu, int prev_cpu, int sync)
53925388
{
5393-
struct llc_stats prev_stats, this_stats;
53945389
s64 this_eff_load, prev_eff_load;
53955390
unsigned long task_load;
53965391

5397-
if (!get_llc_stats(&prev_stats, prev_cpu) ||
5398-
!get_llc_stats(&this_stats, this_cpu))
5399-
return false;
5392+
this_eff_load = target_load(this_cpu, sd->wake_idx);
5393+
prev_eff_load = source_load(prev_cpu, sd->wake_idx);
54005394

5401-
/*
5402-
* If sync wakeup then subtract the (maximum possible)
5403-
* effect of the currently running task from the load
5404-
* of the current LLC.
5405-
*/
54065395
if (sync) {
54075396
unsigned long current_load = task_h_load(current);
54085397

5409-
/* in this case load hits 0 and this LLC is considered 'idle' */
5410-
if (current_load > this_stats.load)
5398+
if (current_load > this_eff_load)
54115399
return true;
54125400

5413-
this_stats.load -= current_load;
5401+
this_eff_load -= current_load;
54145402
}
54155403

5416-
/*
5417-
* The has_capacity stuff is not SMT aware, but by trying to balance
5418-
* the nr_running on both ends we try and fill the domain at equal
5419-
* rates, thereby first consuming cores before siblings.
5420-
*/
5421-
5422-
/* if the old cache has capacity, stay there */
5423-
if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
5424-
return false;
5425-
5426-
/* if this cache has capacity, come here */
5427-
if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
5428-
return true;
5429-
5430-
/*
5431-
* Check to see if we can move the load without causing too much
5432-
* imbalance.
5433-
*/
54345404
task_load = task_h_load(p);
54355405

5436-
this_eff_load = 100;
5437-
this_eff_load *= prev_stats.capacity;
5438-
5439-
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5440-
prev_eff_load *= this_stats.capacity;
5406+
this_eff_load += task_load;
5407+
if (sched_feat(WA_BIAS))
5408+
this_eff_load *= 100;
5409+
this_eff_load *= capacity_of(prev_cpu);
54415410

5442-
this_eff_load *= this_stats.load + task_load;
5443-
prev_eff_load *= prev_stats.load - task_load;
5411+
prev_eff_load -= task_load;
5412+
if (sched_feat(WA_BIAS))
5413+
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5414+
prev_eff_load *= capacity_of(this_cpu);
54445415

54455416
return this_eff_load <= prev_eff_load;
54465417
}
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
54495420
int prev_cpu, int sync)
54505421
{
54515422
int this_cpu = smp_processor_id();
5452-
bool affine;
5423+
bool affine = false;
54535424

5454-
/*
5455-
* Default to no affine wakeups; wake_affine() should not effect a task
5456-
* placement the load-balancer feels inclined to undo. The conservative
5457-
* option is therefore to not move tasks when they wake up.
5458-
*/
5459-
affine = false;
5425+
if (sched_feat(WA_IDLE) && !affine)
5426+
affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
54605427

5461-
/*
5462-
* If the wakeup is across cache domains, try to evaluate if movement
5463-
* makes sense, otherwise rely on select_idle_siblings() to do
5464-
* placement inside the cache domain.
5465-
*/
5466-
if (!cpus_share_cache(prev_cpu, this_cpu))
5467-
affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
5428+
if (sched_feat(WA_WEIGHT) && !affine)
5429+
affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
54685430

54695431
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
54705432
if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
76007562
*/
76017563
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
76027564
{
7603-
struct sched_domain_shared *shared = env->sd->shared;
76047565
struct sched_domain *child = env->sd->child;
76057566
struct sched_group *sg = env->sd->groups;
76067567
struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
76727633
if (env->dst_rq->rd->overload != overload)
76737634
env->dst_rq->rd->overload = overload;
76747635
}
7675-
7676-
if (!shared)
7677-
return;
7678-
7679-
/*
7680-
* Since these are sums over groups they can contain some CPUs
7681-
* multiple times for the NUMA domains.
7682-
*
7683-
* Currently only wake_affine_llc() and find_busiest_group()
7684-
* uses these numbers, only the last is affected by this problem.
7685-
*
7686-
* XXX fix that.
7687-
*/
7688-
WRITE_ONCE(shared->nr_running, sds->total_running);
7689-
WRITE_ONCE(shared->load, sds->total_load);
7690-
WRITE_ONCE(shared->capacity, sds->total_capacity);
76917636
}
76927637

76937638
/**
@@ -8097,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
80978042
struct sched_group *sg = env->sd->groups;
80988043
int cpu, balance_cpu = -1;
80998044

8045+
/*
8046+
* Ensure the balancing environment is consistent; can happen
8047+
* when the softirq triggers 'during' hotplug.
8048+
*/
8049+
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8050+
return 0;
8051+
81008052
/*
81018053
* In the newly idle case, we will allow all the cpu's
81028054
* to do the newly idle load balance.

kernel/sched/features.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
8181
SCHED_FEAT(LB_MIN, false)
8282
SCHED_FEAT(ATTACH_AGE_LOAD, true)
8383

84+
SCHED_FEAT(WA_IDLE, true)
85+
SCHED_FEAT(WA_WEIGHT, true)
86+
SCHED_FEAT(WA_BIAS, true)

0 commit comments

Comments (0)