
Commit 367456c

Peter Zijlstra authored and Ingo Molnar committed
sched: Ditch per cgroup task lists for load-balancing
Per cgroup load-balance has numerous problems, chief amongst them that there is no real sane order in them. So stop pretending it makes sense and enqueue all tasks on a single list.

This also allows us to more easily fix the fwd progress issue uncovered by the lock-break stuff. Rotate the list on failure to migrate and limit the total iterations to nr_running (which with releasing the lock isn't strictly accurate but close enough).

Also add a filter that skips very light tasks on the first attempt around the list; this attempts to avoid shooting whole cgroups around without affecting the overall balance.

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent ddcdf6e commit 367456c
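The scheme described above is easiest to see in miniature. Below is a small, self-contained C sketch, not kernel code: all names (struct task, struct rq, pull_load, ...) and numbers are made up for illustration. It mimics the walk the patch introduces: one flat task list per source runqueue, at most nr_running visits per attempt, very light tasks skipped until balancing has already failed, and every skipped task rotated to the tail so the next pass starts with different candidates.

/*
 * Standalone sketch, not kernel code.  All names and numbers are
 * illustrative only.
 */
#include <stdio.h>

#define NR_TASKS 5

struct task {
	const char *name;
	unsigned long load;		/* stand-in for task_h_load(p) */
};

struct rq {
	struct task *queue[NR_TASKS];	/* stand-in for rq->cfs_tasks */
	int nr_running;
};

/* Take the task at the head of the list. */
static struct task *dequeue_head(struct rq *rq)
{
	struct task *p = rq->queue[0];
	int i;

	for (i = 1; i < rq->nr_running; i++)
		rq->queue[i - 1] = rq->queue[i];
	rq->nr_running--;
	return p;
}

/* Rotate a skipped task to the tail, mirroring list_move_tail(). */
static void requeue_tail(struct rq *rq, struct task *p)
{
	rq->queue[rq->nr_running++] = p;
}

/* Pull up to 'imbalance' worth of load off 'src'. */
static unsigned long pull_load(struct rq *src, unsigned long imbalance,
			       int nr_balance_failed)
{
	unsigned int loop = 0, loop_max = src->nr_running;
	unsigned long pulled = 0;

	while (src->nr_running && loop++ < loop_max) {
		struct task *p = dequeue_head(src);

		/* light task, first attempt: leave its cgroup alone for now */
		if (p->load < 16 && !nr_balance_failed) {
			requeue_tail(src, p);
			continue;
		}
		/* pulling this task would overshoot the imbalance: rotate it */
		if (p->load * 2 > imbalance - pulled) {
			requeue_tail(src, p);
			continue;
		}
		pulled += p->load;	/* "migrate" the task */
		printf("pulled %s (load %lu)\n", p->name, p->load);
	}
	return pulled;
}

int main(void)
{
	struct task t[NR_TASKS] = {
		{ "light", 8 }, { "a", 512 }, { "b", 1024 },
		{ "c", 256 }, { "d", 2048 },
	};
	struct rq src = { { &t[0], &t[1], &t[2], &t[3], &t[4] }, NR_TASKS };
	unsigned long got = pull_load(&src, 1536, 0);

	printf("pulled %lu of 1536, %d tasks left on src\n", got, src.nr_running);
	return 0;
}

In the real patch the same effect comes from the env->loop/loop_max counters and from list_move_tail() on the "goto next" path of balance_tasks(), as the kernel/sched/fair.c hunks below show.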

File tree

3 files changed (+80, -109 lines)


kernel/sched/core.c

Lines changed: 3 additions & 0 deletions

@@ -6959,6 +6959,9 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+		INIT_LIST_HEAD(&rq->cfs_tasks);
+
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_flags = 0;

kernel/sched/fair.c

Lines changed: 75 additions & 101 deletions

@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, se->load.weight);
-		list_add(&se->group_node, &cfs_rq->tasks);
-	}
+#ifdef CONFIG_SMP
+	if (entity_is_task(se))
+		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
 	cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
+	if (entity_is_task(se))
 		list_del_init(&se->group_node);
-	}
 	cfs_rq->nr_running--;
 }
 
@@ -3085,24 +3070,25 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
+#define LBF_NEED_BREAK	0x02
+#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
 	int			src_cpu;
 	struct rq		*src_rq;
-	struct cfs_rq		*src_cfs_rq;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
 
 	enum cpu_idle_type	idle;
 	unsigned long		max_load_move;
 	unsigned int		flags;
+
+	unsigned int		loop;
+	unsigned int		loop_break;
+	unsigned int		loop_max;
 };
 
 /*
@@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
-	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-			if (throttled_lb_pair(task_group(p),
-					      env->src_cpu, env->dst_cpu))
-				break;
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+			continue;
 
-			if (!can_migrate_task(p, env))
-				continue;
+		if (!can_migrate_task(p, env))
+			continue;
 
-			move_task(p, env);
-			/*
-			 * Right now, this is only the second place move_task()
-			 * is called, so we can safely collect move_task()
-			 * stats here rather than inside move_task().
-			 */
-			schedstat_inc(env->sd, lb_gained[env->idle]);
-			return 1;
-		}
+		move_task(p, env);
+		/*
+		 * Right now, this is only the second place move_task()
+		 * is called, so we can safely collect move_task()
+		 * stats here rather than inside move_task().
+		 */
+		schedstat_inc(env->sd, lb_gained[env->idle]);
+		return 1;
 	}
-
 	return 0;
 }
 
+static unsigned long task_h_load(struct task_struct *p);
+
 static unsigned long balance_tasks(struct lb_env *env)
 {
-	int loops = 0, pulled = 0;
 	long rem_load_move = env->max_load_move;
 	struct task_struct *p, *n;
+	unsigned long load;
+	int pulled = 0;
 
 	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
-		if (loops++ > sysctl_sched_nr_migrate) {
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		env->loop++;
+		/* We've more or less seen every task there is, call it quits */
+		if (env->loop > env->loop_max) {
+			env->flags |= LBF_ABORT;
+			break;
+		}
+		/* take a breather every nr_migrate tasks */
+		if (env->loop > env->loop_break) {
+			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, env))
-			continue;
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+				      env->dst_cpu))
+			goto next;
+
+		load = task_h_load(p);
+		if (load < 16 && !env->sd->nr_balance_failed)
+			goto next;
+
+		if ((load * 2) > rem_load_move)
+			goto next;
+
+		if (!can_migrate_task(p, env))
+			goto next;
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= p->se.load.weight;
+		rem_load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 */
 		if (rem_load_move <= 0)
 			break;
+
+		continue;
+next:
+		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
 	}
 out:
 	/*
@@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+	rcu_read_unlock();
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static unsigned long task_h_load(struct task_struct *p)
 {
-	unsigned long max_load_move = env->max_load_move;
-	long rem_load_move = env->max_load_move;
-
-	rcu_read_lock();
-	update_h_load(cpu_of(env->src_rq));
-
-	for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
-		unsigned long busiest_h_load = env->src_cfs_rq->h_load;
-		unsigned long busiest_weight = env->src_cfs_rq->load.weight;
-		u64 rem_load, moved_load;
-
-		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-		/*
-		 * empty group or part of a throttled hierarchy
-		 */
-		if (!env->src_cfs_rq->task_weight)
-			continue;
-
-		if (throttled_lb_pair(env->src_cfs_rq->tg,
-				      cpu_of(env->src_rq),
-				      env->dst_cpu))
-			continue;
-
-		rem_load = (u64)rem_load_move * busiest_weight;
-		rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-		env->max_load_move = rem_load;
-
-		moved_load = balance_tasks(env);
-		if (!moved_load)
-			continue;
-
-		moved_load *= busiest_h_load;
-		moved_load = div_u64(moved_load, busiest_weight + 1);
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	unsigned long load;
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
-			break;
-	}
-	rcu_read_unlock();
+	load = p->se.load.weight;
+	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-	return max_load_move - rem_load_move;
+	return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static inline void update_h_load(long cpu)
 {
-	env->src_cfs_rq = &env->src_rq->cfs;
-	return balance_tasks(env);
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.load.weight;
 }
 #endif
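For context on the hunk above: with CONFIG_FAIR_GROUP_SCHED, the new task_h_load() scales a task's weight by its cfs_rq's hierarchical load, so the "load < 16" filter and the rem_load_move accounting in balance_tasks() see the task's effective contribution to the root runqueue rather than its raw weight. A rough standalone illustration of that arithmetic, with made-up numbers (not kernel code):

/* Illustrative arithmetic only.  A task of weight 1024 sitting in a cfs_rq
 * whose hierarchical load (h_load) is 512 and whose total load.weight is
 * 2048 contributes roughly 1024 * 512 / (2048 + 1) = 255 at the root,
 * which is what the task_h_load() above would return for it. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t weight = 1024;		/* p->se.load.weight */
	uint64_t h_load = 512;		/* cfs_rq->h_load */
	uint64_t grp_weight = 2048;	/* cfs_rq->load.weight */

	printf("task_h_load ~= %llu\n",
	       (unsigned long long)(weight * h_load / (grp_weight + 1)));
	return 0;
}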

@@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env)
 	unsigned long max_load_move = env->max_load_move;
 	unsigned long total_load_moved = 0, load_moved;
 
+	update_h_load(cpu_of(env->src_rq));
 	do {
 		env->max_load_move = max_load_move - total_load_moved;
-		load_moved = load_balance_fair(env);
+		load_moved = balance_tasks(env);
 		total_load_moved += load_moved;
 
 		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
 		.idle		= idle,
+		.loop_break	= sysctl_sched_nr_migrate,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
@@ -4504,6 +4480,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		env.max_load_move = imbalance;
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
+		env.loop_max = busiest->nr_running;
 
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
@@ -4521,9 +4498,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			goto out_balanced;
 
 		if (env.flags & LBF_NEED_BREAK) {
-			env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-			if (env.flags & LBF_ABORT)
-				goto out_balanced;
+			env.flags &= ~LBF_NEED_BREAK;
 			goto redo;
 		}
 
@@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
-	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

kernel/sched/sched.h

Lines changed: 2 additions & 8 deletions

@@ -212,9 +212,6 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 
-	struct list_head tasks;
-	struct list_head *balance_iterator;
-
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
@@ -241,11 +238,6 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	/*
-	 * the part of load.weight contributed by tasks
-	 */
-	unsigned long task_weight;
-
 	/*
 	 * h_load = weight * f(tg)
 	 *
@@ -420,6 +412,8 @@ struct rq {
 	int cpu;
 	int online;
 
+	struct list_head cfs_tasks;
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
