
Commit e8f331b

Peter Zijlstra authored and Ingo Molnar committed
sched/smp: Use lag to simplify cross-runqueue placement
Using lag is both more correct and simpler when moving between runqueues. Notably, min_vruntime() was invented as a cheap approximation of avg_vruntime() for this very purpose (SMP migration). Since we now have the real thing, use it.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
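For context: in this code an entity's lag is the distance, in virtual time, between the runqueue's average vruntime (avg_vruntime()) and the entity's own vruntime, recorded in se->vlag. Because that quantity is relative to whichever queue it was measured on, it can be carried across a migration directly. A minimal sketch of the round trip this enables (simplified; src_cfs_rq and dst_cfs_rq are placeholder names, and the real place_entity() additionally rescales the stored lag for the entity's weight on the destination queue):

	/* Simplified sketch, not the verbatim kernel code. */

	/* Dequeue from the source runqueue: remember the relative position. */
	se->vlag = avg_vruntime(src_cfs_rq) - se->vruntime;

	/* Enqueue on the destination runqueue: re-create that position. */
	se->vruntime = avg_vruntime(dst_cfs_rq) - se->vlag;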
1 parent 76cae9d commit e8f331b

File tree: 1 file changed (+19, -126 lines)


kernel/sched/fair.c

Lines changed: 19 additions & 126 deletions
@@ -5083,7 +5083,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	 *
 	 * EEVDF: placement strategy #1 / #2
 	 */
-	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
 		struct sched_entity *curr = cfs_rq->curr;
 		unsigned long load;
 
@@ -5172,60 +5172,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
 
 static inline bool cfs_bandwidth_used(void);
 
-/*
- * MIGRATION
- *
- *	dequeue
- *	  update_curr()
- *	    update_min_vruntime()
- *	vruntime -= min_vruntime
- *
- *	enqueue
- *	  update_curr()
- *	    update_min_vruntime()
- *	vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
- *	  vruntime -= min_vruntime
- *
- *	enqueue
- *	  update_curr()
- *	    update_min_vruntime()
- *	  vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
 	bool curr = cfs_rq->curr == se;
 
 	/*
 	 * If we're the current task, we must renormalise before calling
 	 * update_curr().
 	 */
-	if (renorm && curr)
-		se->vruntime += cfs_rq->min_vruntime;
+	if (curr)
+		place_entity(cfs_rq, se, 0);
 
 	update_curr(cfs_rq);
 
-	/*
-	 * Otherwise, renormalise after, such that we're placed at the current
-	 * moment in time, instead of some random moment in the past. Being
-	 * placed in the past could significantly boost this task to the
-	 * fairness detriment of existing tasks.
-	 */
-	if (renorm && !curr)
-		se->vruntime += cfs_rq->min_vruntime;
-
 	/*
 	 * When enqueuing a sched_entity, we must:
 	 *   - Update loads to have both entity and cfs_rq synced with now.
@@ -5237,11 +5197,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
 	se_update_runnable(se);
+	/*
+	 * XXX update_load_avg() above will have attached us to the pelt sum;
+	 * but update_cfs_group() here will re-adjust the weight and have to
+	 * undo/redo all that. Seems wasteful.
+	 */
 	update_cfs_group(se);
-	account_entity_enqueue(cfs_rq, se);
 
-	if (flags & ENQUEUE_WAKEUP)
+	/*
+	 * XXX now that the entity has been re-weighted, and it's lag adjusted,
+	 * we can place the entity.
+	 */
+	if (!curr)
 		place_entity(cfs_rq, se, 0);
+
+	account_entity_enqueue(cfs_rq, se);
+
 	/* Entity has migrated, no longer consider this task hot */
 	if (flags & ENQUEUE_MIGRATED)
 		se->exec_start = 0;
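Stripped of the diff markers, the enqueue path now reads roughly as follows (a condensed sketch of enqueue_entity() after this patch, not a verbatim copy):

	/* Condensed sketch of the resulting enqueue_entity() ordering. */
	if (curr)				/* current task renormalises via placement */
		place_entity(cfs_rq, se, 0);
	update_curr(cfs_rq);
	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
	se_update_runnable(se);
	update_cfs_group(se);			/* settle the weight first ...         */
	if (!curr)
		place_entity(cfs_rq, se, 0);	/* ... then do the lag-based placement */
	account_entity_enqueue(cfs_rq, se);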
@@ -5346,23 +5317,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
-	if (flags & DEQUEUE_SLEEP)
-		update_entity_lag(cfs_rq, se);
-
+	update_entity_lag(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
 
-	/*
-	 * Normalize after update_curr(); which will also have moved
-	 * min_vruntime if @se is the one holding it back. But before doing
-	 * update_min_vruntime() again, which will discount @se's position and
-	 * can move min_vruntime forward still more.
-	 */
-	if (!(flags & DEQUEUE_SLEEP))
-		se->vruntime -= cfs_rq->min_vruntime;
-
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
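Note that update_entity_lag() is now called on every dequeue, not only when the task goes to sleep, so an entity always leaves the runqueue with a fresh lag value for the next placement to consume. Conceptually the helper (introduced earlier in the EEVDF series) does something like the following; the in-tree version also clamps the result:

	/* Rough shape of update_entity_lag(); the real helper clamps se->vlag. */
	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;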
@@ -8208,18 +8168,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
 {
 	struct sched_entity *se = &p->se;
 
-	/*
-	 * As blocked tasks retain absolute vruntime the migration needs to
-	 * deal with this by subtracting the old and adding the new
-	 * min_vruntime -- the latter is done by enqueue_entity() when placing
-	 * the task on the new runqueue.
-	 */
-	if (READ_ONCE(p->__state) == TASK_WAKING) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-		se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
-	}
-
 	if (!task_on_rq_migrating(p)) {
 		remove_entity_load_avg(se);
 
@@ -12709,8 +12657,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  */
 static void task_fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se, *curr;
+	struct cfs_rq *cfs_rq;
 	struct rq *rq = this_rq();
 	struct rq_flags rf;
 
@@ -12719,22 +12667,9 @@ static void task_fork_fair(struct task_struct *p)
 
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
-	if (curr) {
+	if (curr)
 		update_curr(cfs_rq);
-		se->vruntime = curr->vruntime;
-	}
 	place_entity(cfs_rq, se, 1);
-
-	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-		/*
-		 * Upon rescheduling, sched_class::put_prev_task() will place
-		 * 'current' within the tree based on its new key value.
-		 */
-		swap(curr->vruntime, se->vruntime);
-		resched_curr(rq);
-	}
-
-	se->vruntime -= cfs_rq->min_vruntime;
 	rq_unlock(rq, &rf);
 }
 
@@ -12763,34 +12698,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	check_preempt_curr(rq, p, 0);
 }
 
-static inline bool vruntime_normalized(struct task_struct *p)
-{
-	struct sched_entity *se = &p->se;
-
-	/*
-	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
-	 * the dequeue_entity(.flags=0) will already have normalized the
-	 * vruntime.
-	 */
-	if (p->on_rq)
-		return true;
-
-	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
-	 * But there are some cases where it has already been normalized:
-	 *
-	 * - A forked child which is waiting for being woken up by
-	 *   wake_up_new_task().
-	 * - A task which has been woken up by try_to_wake_up() and
-	 *   waiting for actually being woken up by sched_ttwu_pending().
-	 */
-	if (!se->sum_exec_runtime ||
-	    (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
-		return true;
-
-	return false;
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Propagate the changes of the sched_entity across the tg tree to make it
@@ -12861,29 +12768,15 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 static void detach_task_cfs_rq(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-	if (!vruntime_normalized(p)) {
-		/*
-		 * Fix up our vruntime so that the current sleep doesn't
-		 * cause 'unlimited' sleep bonus.
-		 */
-		place_entity(cfs_rq, se, 0);
-		se->vruntime -= cfs_rq->min_vruntime;
-	}
 
 	detach_entity_cfs_rq(se);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	attach_entity_cfs_rq(se);
-
-	if (!vruntime_normalized(p))
-		se->vruntime += cfs_rq->min_vruntime;
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
