@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running + 1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running + 1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
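
As a rough illustration of the hunk above: wake_affine_weight() compares an "effective load" for the waking CPU against one for the previous CPU, each scaled by the other side's capacity, and (under WA_BIAS) biases the previous CPU by half the domain's imbalance_pct. The standalone user-space sketch below reproduces only that arithmetic; it is not kernel code, the sync adjustment is left out, and the constants are made-up stand-ins for target_load(), source_load(), capacity_of() and task_h_load().

/*
 * Standalone sketch (not kernel code) of the WA_BIAS comparison in
 * wake_affine_weight().  All numbers are made up for illustration.
 */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	int64_t this_load     = 2048;	/* stand-in for target_load(this_cpu, wake_idx) */
	int64_t prev_load     = 3072;	/* stand-in for source_load(prev_cpu, wake_idx) */
	int64_t task_load     = 512;	/* stand-in for task_h_load(p) */
	int64_t this_capacity = 1024;	/* stand-in for capacity_of(this_cpu) */
	int64_t prev_capacity = 1024;	/* stand-in for capacity_of(prev_cpu) */
	int64_t imbalance_pct = 117;	/* a common sd->imbalance_pct value */

	/* The waking side carries the task and is scaled by 100 ... */
	int64_t this_eff_load = (this_load + task_load) * 100 * prev_capacity;
	/* ... the previous side sheds the task and gets half the imbalance_pct as a bonus. */
	int64_t prev_eff_load = (prev_load - task_load)
				* (100 + (imbalance_pct - 100) / 2) * this_capacity;

	printf("this_eff_load=%" PRId64 " prev_eff_load=%" PRId64 " -> affine=%d\n",
	       this_eff_load, prev_eff_load, this_eff_load <= prev_eff_load);
	return 0;
}

With these numbers the comparison is 262,144,000 <= 283,115,520, so the wakeup would be pulled to the waking CPU; (117 - 100) / 2 gives the previous CPU an 8% handicap, which is what keeps marginal load differences from migrating the task.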
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
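
The rewritten wake_affine() above keeps the conservative default of leaving the task on its previous CPU and lets each enabled heuristic claim the wakeup at most once, in order. WA_IDLE, WA_WEIGHT (and WA_BIAS used earlier) are scheduler feature bits in kernel/sched/features.h, presumably introduced alongside this change although that part of the patch is not shown here. The toy sketch below only mimics that control flow with stand-in flags and heuristics; it is not kernel code.

/*
 * Toy model of the new wake_affine() control flow: conservative default,
 * each enabled heuristic consulted only while nothing has claimed the
 * wakeup yet.  Flags and heuristics below are stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static const bool wa_idle_enabled   = true;	/* stand-in for sched_feat(WA_IDLE) */
static const bool wa_weight_enabled = true;	/* stand-in for sched_feat(WA_WEIGHT) */

static bool heuristic_idle(void)   { return false; }	/* pretend the waking CPU is busy */
static bool heuristic_weight(void) { return true;  }	/* pretend the weights favour it anyway */

int main(void)
{
	bool affine = false;	/* default: leave the task where it was */

	if (wa_idle_enabled && !affine)
		affine = heuristic_idle();

	if (wa_weight_enabled && !affine)
		affine = heuristic_weight();

	printf("affine wakeup: %s\n", affine ? "yes" : "no");
	return 0;
}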
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
@@ -8097,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
 	struct sched_group *sg = env->sd->groups;
 	int cpu, balance_cpu = -1;
 
+	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
 	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
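
The new guard in should_we_balance() above bails out when the destination CPU is no longer part of the set of CPUs this balancing pass may use, which the comment attributes to the softirq firing during hotplug. Below is a small user-space sketch of that guard; the struct, bitmask and helper are made-up stand-ins for the kernel's struct lb_env and cpumask_test_cpu().

/*
 * User-space sketch of the guard added to should_we_balance().  The
 * struct, bitmask and helper below are made-up stand-ins for the
 * kernel's struct lb_env and cpumask_test_cpu().
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_lb_env {
	int dst_cpu;
	unsigned int cpus;	/* bit i set => CPU i may take part in this pass */
};

/* stand-in for cpumask_test_cpu() */
static bool cpu_in_mask(int cpu, unsigned int mask)
{
	return mask & (1u << cpu);
}

static int toy_should_we_balance(const struct toy_lb_env *env)
{
	/*
	 * Mirror the added check: if dst_cpu dropped out of the usable set
	 * (e.g. hot-unplugged after the softirq was raised), don't balance.
	 */
	if (!cpu_in_mask(env->dst_cpu, env->cpus))
		return 0;

	/* ... the rest of the real decision is elided ... */
	return 1;
}

int main(void)
{
	struct toy_lb_env env = { .dst_cpu = 3, .cpus = 0xf7 };	/* CPU 3 no longer usable */

	printf("should_we_balance -> %d\n", toy_should_we_balance(&env));
	return 0;
}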