
Commit fee1759

pdxChen authored and Peter Zijlstra committed
sched/fair: Determine active load balance for SMT sched groups
On hybrid CPUs with scheduling cluster enabled, we will need to consider
balancing between the SMT CPU cluster and the Atom core cluster.

Below shows such a hybrid x86 CPU with 4 big cores and 8 Atom cores.
Each scheduling cluster spans an L2 cache.

          --L2-- --L2-- --L2-- --L2-- ----L2----    -----L2------
          [0, 1] [2, 3] [4, 5] [6, 7] [8 9 10 11]  [12 13 14 15]
          Big    Big    Big    Big    Atom         Atom
          core   core   core   core   Module       Module

If the busiest group is a big core with both SMT CPUs busy, we should
perform active load balancing when the destination group has idle CPU cores.
This condition is already handled by asym_active_balance() during load
balancing, but it is not considered when looking for the busiest group and
computing the load imbalance. Add this consideration in find_busiest_group()
and calculate_imbalance().

In addition, update the logic that determines the busier group when one
group is SMT and the other is non-SMT but both are partially busy with idle
CPUs. The busier group should be the group with idle cores rather than the
group with one busy SMT CPU. We do not want to make the SMT group the
busiest, pull its only task off the SMT CPU, and cause the whole core to go
empty.

Otherwise, suppose that in the search for the busiest group we first
encounter an SMT group with 1 task and set it as the busiest. If the
destination group is an Atom cluster with 1 task and we next encounter an
Atom cluster group with 3 tasks, we will not pick this Atom cluster over the
SMT group, even though we should. As a result, we do not load balance the
busier Atom cluster (with 3 tasks) towards the local Atom cluster (with 1
task). It also does not make sense to pick the 1-task SMT group as the
busier group, as we should not pull its task towards the 1-task Atom cluster
and leave the SMT core completely empty.

Signed-off-by: Tim Chen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/e24f35d142308790f69be65930b82794ef6658a2.1688770494.git.tim.c.chen@linux.intel.com
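To make the busier-group rule above concrete, here is a minimal, self-contained
sketch of the SMT vs non-SMT tie-break. It is not kernel code: struct group_stats,
is_smt, nr_running and pick_busier are illustrative placeholders, and the real
logic lives in update_sd_pick_busiest() in the diff below.

/*
 * Sketch of the tie-break for partially busy groups: an SMT group running
 * at most one task is never picked as busiest over a non-SMT group, so its
 * lone task is not pulled away and the core left empty.
 */
#include <stdbool.h>
#include <stdio.h>

struct group_stats {
        bool is_smt;             /* group spans SMT siblings of one core */
        unsigned int nr_running; /* runnable tasks in the group */
};

/* Return true if @candidate should replace @busiest as the busiest group. */
static bool pick_busier(const struct group_stats *busiest,
                        const struct group_stats *candidate)
{
        /* Only the SMT vs non-SMT case needs the special tie-break. */
        if (busiest->is_smt != candidate->is_smt) {
                /* Never make a <=1-task SMT group the busiest one. */
                if (candidate->is_smt && candidate->nr_running <= 1)
                        return false;
                /* Candidate is non-SMT, or an SMT group with a busy sibling. */
                return true;
        }
        /* Same kind of group: fall back to comparing task counts. */
        return candidate->nr_running > busiest->nr_running;
}

int main(void)
{
        struct group_stats smt_one_task = { .is_smt = true,  .nr_running = 1 };
        struct group_stats atom_three   = { .is_smt = false, .nr_running = 3 };

        /* The 3-task Atom cluster displaces the 1-task SMT core as busiest... */
        printf("atom(3) over smt(1): %d\n", pick_busier(&smt_one_task, &atom_three));
        /* ...but the 1-task SMT core never displaces the Atom cluster. */
        printf("smt(1) over atom(3): %d\n", pick_busier(&atom_three, &smt_one_task));
        return 0;
}

Compiled and run, the two checks print 1 and 0, matching the example in the
commit message: the busier Atom cluster wins, and the lone SMT task stays put.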
1 parent 35cd21f · commit fee1759

1 file changed

kernel/sched/fair.c

Lines changed: 77 additions & 3 deletions
@@ -8446,6 +8446,11 @@ enum group_type {
          * more powerful CPU.
          */
         group_misfit_task,
+        /*
+         * Balance SMT group that's fully busy. Can benefit from migration
+         * a task on SMT with busy sibling to another CPU on idle core.
+         */
+        group_smt_balance,
         /*
          * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
          * and the task should be migrated to it instead of running on the
@@ -9154,6 +9159,7 @@ struct sg_lb_stats {
         unsigned int group_weight;
         enum group_type group_type;
         unsigned int group_asym_packing;   /* Tasks should be moved to preferred CPU */
+        unsigned int group_smt_balance;    /* Task on busy SMT be moved */
         unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
         unsigned int nr_numa_running;
@@ -9427,6 +9433,9 @@ group_type group_classify(unsigned int imbalance_pct,
         if (sgs->group_asym_packing)
                 return group_asym_packing;

+        if (sgs->group_smt_balance)
+                return group_smt_balance;
+
         if (sgs->group_misfit_task_load)
                 return group_misfit_task;

@@ -9496,6 +9505,36 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
         return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }

+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+                                        struct sched_group *sg2)
+{
+        if (!sg1 || !sg2)
+                return false;
+
+        return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+                (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+                               struct sched_group *group)
+{
+        if (env->idle == CPU_NOT_IDLE)
+                return false;
+
+        /*
+         * For SMT source group, it is better to move a task
+         * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+         * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+         * will not be on.
+         */
+        if (group->flags & SD_SHARE_CPUCAPACITY &&
+            sgs->sum_h_nr_running > 1)
+                return true;
+
+        return false;
+}
+
 static inline bool
 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 {
@@ -9588,6 +9627,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 sgs->group_asym_packing = 1;
         }

+        /* Check for loaded SMT group to be balanced to dst CPU */
+        if (!local_group && smt_balance(env, sgs, group))
+                sgs->group_smt_balance = 1;
+
         sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);

         /* Computing avg_load makes sense only when group is overloaded */
@@ -9672,6 +9715,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                         return false;
                 break;

+        case group_smt_balance:
         case group_fully_busy:
                 /*
                  * Select the fully busy group with highest avg_load. In
@@ -9700,6 +9744,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                 break;

         case group_has_spare:
+                /*
+                 * Do not pick sg with SMT CPUs over sg with pure CPUs,
+                 * as we do not want to pull task off SMT core with one task
+                 * and make the core idle.
+                 */
+                if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+                        if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+                                return false;
+                        else
+                                return true;
+                }
+
                 /*
                  * Select not overloaded group with lowest number of idle cpus
                  * and highest number of running tasks. We could also compare
@@ -9896,6 +9952,7 @@ static bool update_pick_idlest(struct sched_group *idlest,

         case group_imbalanced:
         case group_asym_packing:
+        case group_smt_balance:
                 /* Those types are not used in the slow wakeup path */
                 return false;

@@ -10027,6 +10084,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)

         case group_imbalanced:
         case group_asym_packing:
+        case group_smt_balance:
                 /* Those type are not used in the slow wakeup path */
                 return NULL;

@@ -10281,6 +10339,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 return;
         }

+        if (busiest->group_type == group_smt_balance) {
+                /* Reduce number of tasks sharing CPU capacity */
+                env->migration_type = migrate_task;
+                env->imbalance = 1;
+                return;
+        }
+
         if (busiest->group_type == group_imbalanced) {
                 /*
                  * In the group_imb case we cannot rely on group-wide averages
@@ -10536,16 +10601,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                 goto force_balance;

         if (busiest->group_type != group_overloaded) {
-                if (env->idle == CPU_NOT_IDLE)
+                if (env->idle == CPU_NOT_IDLE) {
                         /*
                          * If the busiest group is not overloaded (and as a
                          * result the local one too) but this CPU is already
                          * busy, let another idle CPU try to pull task.
                          */
                         goto out_balanced;
+                }
+
+                if (busiest->group_type == group_smt_balance &&
+                    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+                        /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+                        goto force_balance;
+                }

                 if (busiest->group_weight > 1 &&
-                    local->idle_cpus <= (busiest->idle_cpus + 1))
+                    local->idle_cpus <= (busiest->idle_cpus + 1)) {
                         /*
                          * If the busiest group is not overloaded
                          * and there is no imbalance between this and busiest
@@ -10556,12 +10628,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                          * there is more than 1 CPU per group.
                          */
                         goto out_balanced;
+                }

-                if (busiest->sum_h_nr_running == 1)
+                if (busiest->sum_h_nr_running == 1) {
                         /*
                          * busiest doesn't have any tasks waiting to run
                          */
                         goto out_balanced;
+                }
         }

 force_balance:
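Read together, the hunks above (a) classify a fully busy SMT source group as
group_smt_balance when the destination CPU is idle and (b) size the resulting
imbalance at exactly one task. The condensed sketch below illustrates that path
under simplified assumptions; struct env, struct stats, classify and imbalance
are hypothetical names, not the scheduler's types.

/*
 * Sketch: an SMT group with a busy sibling is flagged for smt balance when
 * the pulling CPU is idle, and the requested imbalance is a single task
 * (mirroring env->imbalance = 1 in calculate_imbalance()).
 */
#include <stdbool.h>
#include <stdio.h>

enum group_type { GROUP_HAS_SPARE, GROUP_SMT_BALANCE };

struct env   { bool dst_cpu_idle; };             /* pulling CPU state */
struct stats { bool group_is_smt; unsigned int nr_running; };

/* Mirrors the smt_balance() check: SMT source group with a busy sibling. */
static enum group_type classify(const struct env *env, const struct stats *s)
{
        if (env->dst_cpu_idle && s->group_is_smt && s->nr_running > 1)
                return GROUP_SMT_BALANCE;
        return GROUP_HAS_SPARE;
}

/* Mirrors calculate_imbalance(): move exactly one task off the busy SMT core. */
static unsigned int imbalance(enum group_type type)
{
        return type == GROUP_SMT_BALANCE ? 1 : 0;
}

int main(void)
{
        struct env env = { .dst_cpu_idle = true };
        struct stats busy_smt = { .group_is_smt = true, .nr_running = 2 };

        enum group_type t = classify(&env, &busy_smt);
        printf("smt_balance? %d, tasks to move: %u\n",
               t == GROUP_SMT_BALANCE, imbalance(t));
        return 0;
}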
