
Commit 86bfbb7

Peter Zijlstra authored and Ingo Molnar committed
sched/fair: Add lag based placement
With the introduction of avg_vruntime, it is possible to approximate lag (the entire purpose of introducing it in fact). Use this to do lag based placement over sleep+wake.

Specifically, the FAIR_SLEEPERS thing places things too far to the left and messes up the deadline aspect of EEVDF.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
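Before the diffs, a minimal user-space sketch may help make the placement rule concrete. It is not part of this commit; the numbers and names (W, V, w_i, vlag) are purely illustrative, and it only mimics the compensation vl_i = (W + w_i) * vl'_i / W that the place_entity() comment in kernel/sched/fair.c derives below.

/*
 * Illustrative sketch only; not part of this commit. Models how a woken
 * entity's virtual lag (vl_i = V - v_i, saved at dequeue) is preserved
 * across placement despite the entity itself shifting the weighted
 * average V once it is re-added. All values are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t W    = 3072;     /* total weight already on the queue      */
	int64_t V    = 1000000;  /* current weighted-average vruntime      */
	int64_t w_i  = 1024;     /* weight of the entity being placed      */
	int64_t vlag = 5000;     /* virtual lag saved at dequeue (V - v_i) */

	/* Inflate the lag before placement: vl_i = (W + w_i) * vl'_i / W. */
	int64_t lag = vlag * (W + w_i) / W;

	/* Place the entity at the compensated vruntime. */
	int64_t v_i = V - lag;

	/* Weighted average after the entity is added back. */
	int64_t V_new = (W * V + w_i * v_i) / (W + w_i);

	/* Effective lag matches the saved one, up to integer rounding. */
	printf("saved vlag=%lld, effective vlag after placement=%lld\n",
	       (long long)vlag, (long long)(V_new - v_i));
	return 0;
}

Running it shows the effective lag after re-insertion staying at (almost) the saved 5000, whereas placing the entity at V - vlag directly would shrink its lag by a factor of w_i/(W + w_i).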
1 parent e0c2ff9 commit 86bfbb7

4 files changed: +141 −39 lines

include/linux/sched.h

Lines changed: 2 additions & 1 deletion
@@ -554,8 +554,9 @@ struct sched_entity {
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-	u64				vruntime;
 	u64				prev_sum_exec_runtime;
+	u64				vruntime;
+	s64				vlag;
 
 	u64				nr_migrations;

kernel/sched/core.c

Lines changed: 1 addition & 0 deletions
@@ -4501,6 +4501,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.vlag			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED

kernel/sched/fair.c

Lines changed: 130 additions & 38 deletions
@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	return cfs_rq->min_vruntime + avg;
 }
 
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	SCHED_WARN_ON(!se->on_rq);
+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+}
+
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	u64 min_vruntime = cfs_rq->min_vruntime;
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -4853,49 +4872,119 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards, this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+		struct sched_entity *curr = cfs_rq->curr;
+		unsigned long load;
 
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
+		lag = se->vlag;
 
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * If we want to place a task and preserve lag, we have to
+		 * consider the effect of the new entity on the weighted
+		 * average and compensate for this, otherwise lag can quickly
+		 * evaporate.
+		 *
+		 * Lag is defined as:
+		 *
+		 *   lag_i = S - s_i = w_i * (V - v_i)
+		 *
+		 * To avoid the 'w_i' term all over the place, we only track
+		 * the virtual lag:
+		 *
+		 *   vl_i = V - v_i <=> v_i = V - vl_i
+		 *
+		 * And we take V to be the weighted average of all v:
+		 *
+		 *   V = (\Sum w_j*v_j) / W
+		 *
+		 * Where W is: \Sum w_j
+		 *
+		 * Then, the weighted average after adding an entity with lag
+		 * vl_i is given by:
+		 *
+		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+		 *      = V - w_i*vl_i / (W + w_i)
+		 *
+		 * And the actual lag after adding an entity with vl_i is:
+		 *
+		 *   vl'_i = V' - v_i
+		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+		 *         = vl_i - w_i*vl_i / (W + w_i)
+		 *
+		 * Which is strictly less than vl_i. So in order to preserve lag
+		 * we should inflate the lag before placement such that the
+		 * effective lag after placement comes out right.
+		 *
+		 * As such, invert the above relation for vl'_i to get the vl_i
+		 * we need to use such that the lag after placement is the lag
+		 * we computed before dequeue.
+		 *
+		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+		 *
+		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+		 *                   = W*vl_i
+		 *
+		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
-
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.
-	 * However, min_vruntime can advance much faster than real time, with
-	 * the extreme being when an entity with the minimal weight always runs
-	 * on the cfs_rq. If the waking entity slept for a long time, its
-	 * vruntime difference from min_vruntime may overflow s64 and their
-	 * comparison may get inversed, so ignore the entity's original
-	 * vruntime in that case.
-	 * The maximal vruntime speedup is given by the ratio of normal to
-	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-	 * When placing a migrated waking entity, its exec_start has been set
-	 * from a different rq. In order to take into account a possible
-	 * divergence between new and prev rq's clocks task because of irq and
-	 * stolen time, we take an additional margin.
-	 * So, cutting off on the sleep time of
-	 *     2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-	 * should be safe.
-	 */
-	if (entity_is_long_sleeper(se))
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+		load = cfs_rq->avg_load;
+		if (curr && curr->on_rq)
+			load += curr->load.weight;
+
+		lag *= load + se->load.weight;
+		if (WARN_ON_ONCE(!load))
+			load = 1;
+		lag = div_s64(lag, load);
+
+		vruntime -= lag;
+	}
+
+	if (sched_feat(FAIR_SLEEPERS)) {
+
+		/* sleeps up to a single latency don't count. */
+		if (!initial) {
+			unsigned long thresh;
+
+			if (se_is_idle(se))
+				thresh = sysctl_sched_min_granularity;
+			else
+				thresh = sysctl_sched_latency;
+
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
+			vruntime -= thresh;
+		}
+
+		/*
+		 * Pull vruntime of the entity being placed to the base level of
+		 * cfs_rq, to prevent boosting it if placed backwards. If the entity
+		 * slept for a long time, don't even try to compare its vruntime with
+		 * the base as it may be too far off and the comparison may get
+		 * inversed due to s64 overflow.
+		 */
+		if (!entity_is_long_sleeper(se))
+			vruntime = max_vruntime(se->vruntime, vruntime);
+	}
+
+	se->vruntime = vruntime;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
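For readers who want to check the algebra, here is a stand-alone C sketch (illustrative only, not kernel code; the weights and lag values are made up) that verifies the two identities the fair.c changes rely on: the inflation vl_i = (W + w_i)*vl'_i / W in place_entity() undoes the dilution caused by adding the entity to the weighted average, and the old_weight/weight rescaling in reweight_entity() keeps the weighted lag w_i * vl_i constant.

/*
 * Illustrative check of the two lag identities used in this commit.
 * Plain user-space C with made-up numbers; not kernel code.
 */
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * place_entity(): inflating the saved lag by (W + w_i)/W must undo
	 * the dilution vl'_i = vl_i - w_i*vl_i/(W + w_i) that adding the
	 * entity causes to the weighted average V.
	 */
	double W = 3072.0, w_i = 1024.0, vl_target = 5000.0;
	double vl_inflated = (W + w_i) * vl_target / W;
	double vl_after = vl_inflated - w_i * vl_inflated / (W + w_i);
	assert(fabs(vl_after - vl_target) < 1e-9);

	/*
	 * reweight_entity(): se->vlag stores V - v_i, but lag is
	 * w_i*(V - v_i), so rescaling vlag by old_weight/new_weight keeps
	 * the weighted lag unchanged (exactly so when the division is exact;
	 * the kernel uses div_s64(), which truncates).
	 */
	int64_t old_weight = 1024, new_weight = 512, vlag = 4000;
	int64_t scaled = vlag * old_weight / new_weight;
	assert(new_weight * scaled == old_weight * vlag);

	printf("lag after placement: %.1f (target %.1f), rescaled vlag: %lld\n",
	       vl_after, vl_target, (long long)scaled);
	return 0;
}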

kernel/sched/features.h

Lines changed: 8 additions & 0 deletions
@@ -1,11 +1,19 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+
 /*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
  * rip the spread apart.
  */
+SCHED_FEAT(FAIR_SLEEPERS, false)
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 
+/*
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
+ */
+SCHED_FEAT(PLACE_LAG, true)
+
 /*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
