
Commit af4cf40

Peter Zijlstra authored and Ingo Molnar committed
sched/fair: Add cfs_rq::avg_vruntime
In order to move to an eligibility based scheduling policy, we need to have a better approximation of the ideal scheduler.

Specifically, for a virtual time weighted fair queueing based scheduler the ideal scheduler will be the weighted average of the individual virtual runtimes (math in the comment).

As such, compute the weighted average to approximate the ideal scheduler -- note that the approximation is in the individual task behaviour, which isn't strictly conformant. Specifically consider adding a task with a vruntime left of center, in this case the average will move backwards in time -- something the ideal scheduler would of course never do.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 752182b commit af4cf40
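
The caveat in the message about the average moving backwards is easy to reproduce by hand. Below is a purely illustrative userspace sketch (the toy_se struct, the weights and the vruntime values are invented; this is not kernel code): it computes V = \Sum w_i * v_i / \Sum w_i for two tasks, then adds a third task whose vruntime lies left of that average and recomputes.

#include <stdio.h>

/* Hypothetical stand-in for a sched_entity: weight and vruntime only. */
struct toy_se { long long w; long long v; };

/* Weighted average of the virtual runtimes, V = \Sum w_i * v_i / \Sum w_i. */
static long long toy_avg_vruntime(const struct toy_se *se, int n)
{
	long long sum = 0, load = 0;
	for (int i = 0; i < n; i++) {
		sum  += se[i].w * se[i].v;
		load += se[i].w;
	}
	return load ? sum / load : 0;
}

int main(void)
{
	struct toy_se rq[3] = { { 1024, 100 }, { 1024, 300 } };

	printf("V with two tasks : %lld\n", toy_avg_vruntime(rq, 2)); /* 200 */

	/* A third task joins with vruntime left of the current average. */
	rq[2] = (struct toy_se){ 1024, 50 };
	printf("V after the join : %lld\n", toy_avg_vruntime(rq, 3)); /* 150: V moved backwards */

	return 0;
}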

File tree

3 files changed: +154, -20 lines


kernel/sched/debug.c

Lines changed: 15 additions & 17 deletions

@@ -627,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
-		spread, rq0_min_vruntime, spread0;
+	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+	struct sched_entity *last, *first;
 	struct rq *rq = cpu_rq(cpu);
-	struct sched_entity *last;
 	unsigned long flags;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -644,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(cfs_rq->exec_clock));
 
 	raw_spin_rq_lock_irqsave(rq, flags);
-	if (rb_first_cached(&cfs_rq->tasks_timeline))
-		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+	first = __pick_first_entity(cfs_rq);
+	if (first)
+		left_vruntime = first->vruntime;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
-		max_vruntime = last->vruntime;
+		right_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
-	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
-			SPLIT_NS(MIN_vruntime));
+
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
+			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
 			SPLIT_NS(min_vruntime));
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime",
-			SPLIT_NS(max_vruntime));
-	spread = max_vruntime - MIN_vruntime;
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread",
-			SPLIT_NS(spread));
-	spread0 = min_vruntime - rq0_min_vruntime;
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
-			SPLIT_NS(spread0));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
+			SPLIT_NS(avg_vruntime(cfs_rq)));
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
+			SPLIT_NS(right_vruntime));
+	spread = right_vruntime - left_vruntime;
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
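
With these changes the cfs_rq block in /proc/sched_debug drops MIN_vruntime, max_vruntime and spread0 in favour of left_vruntime, avg_vruntime and right_vruntime. The lines below only illustrate the layout implied by the SEQ_printf format strings in the hunk above; the numbers are made up:

  .left_vruntime                 : 123456.789012
  .min_vruntime                  : 123456.789012
  .avg_vruntime                  : 123500.000000
  .right_vruntime                : 123987.654321
  .spread                        : 530.865309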

kernel/sched/fair.c

Lines changed: 134 additions & 3 deletions

@@ -601,9 +601,134 @@ static inline bool entity_before(const struct sched_entity *a,
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
 #define __node_2_se(node) \
 	rb_entry((node), struct sched_entity, run_node)
 
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ *   \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ *   lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ *   \Sum lag_i = 0
+ *   \Sum w_i * (V - v_i) = 0
+ *   \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ *       \Sum v_i * w_i   \Sum v_i * w_i
+ *   V = -------------- = --------------
+ *          \Sum w_i            W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ *          that join/leave operations happen at lag_i = 0, otherwise the
+ *          virtual time has non-contiguous motion equivalent to:
+ *
+ *            V +-= lag_i / W
+ *
+ *          Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow,
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ *                  W                           W
+ *
+ * Which we track using:
+ *
+ *                    v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ *              \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonically increasing variable that closely
+ * tracks the per-task service, these deltas: (v_i - v0), will be in the
+ * order of the maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime += key * weight;
+	cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = scale_load_down(se->load.weight);
+	s64 key = entity_key(cfs_rq, se);
+
+	cfs_rq->avg_vruntime -= key * weight;
+	cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+	/*
+	 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
	 */
+	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 avg = cfs_rq->avg_vruntime;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq) {
+		unsigned long weight = scale_load_down(curr->load.weight);
+
+		avg += entity_key(cfs_rq, curr) * weight;
+		load += weight;
+	}
+
+	if (load)
+		avg = div_s64(avg, load);
+
+	return cfs_rq->min_vruntime + avg;
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+	u64 min_vruntime = cfs_rq->min_vruntime;
+	/*
+	 * open coded max_vruntime() to allow updating avg_vruntime
+	 */
+	s64 delta = (s64)(vruntime - min_vruntime);
+	if (delta > 0) {
+		avg_vruntime_update(cfs_rq, delta);
+		min_vruntime = vruntime;
+	}
+	return min_vruntime;
+}
+
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
@@ -629,7 +754,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	/* ensure we never gain time by being placed backwards. */
 	u64_u32_store(cfs_rq->min_vruntime,
-		      max_vruntime(cfs_rq->min_vruntime, vruntime));
+		      __update_min_vruntime(cfs_rq, vruntime));
 }
 
 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -642,12 +767,14 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	avg_vruntime_add(cfs_rq, se);
 	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+	avg_vruntime_sub(cfs_rq, se);
 }
 
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -3379,6 +3506,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
 			update_curr(cfs_rq);
+		else
+			avg_vruntime_sub(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
@@ -3394,9 +3523,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 #endif
 
 	enqueue_load_avg(cfs_rq, se);
-	if (se->on_rq)
+	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-
+		if (cfs_rq->curr != se)
+			avg_vruntime_add(cfs_rq, se);
+	}
 }
 
 void reweight_task(struct task_struct *p, int prio)
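
The relative-form bookkeeping derived in the comment block above can be sanity-checked outside the kernel. The sketch below uses hypothetical toy_* names, plain 64-bit arithmetic and no scale_load_down(); it mirrors the accounting in avg_vruntime_add() and avg_vruntime_update() and the final division in avg_vruntime() (ignoring the special-casing of the current entity), and shows that re-basing against min_vruntime leaves the reported average V unchanged:

#include <stdio.h>

/* Hypothetical mirror of the three cfs_rq fields touched by this commit. */
struct toy_cfs_rq {
	long long avg_vruntime;   /* \Sum (v_i - min_vruntime) * w_i */
	long long avg_load;       /* \Sum w_i */
	long long min_vruntime;   /* v0 */
};

/* Like avg_vruntime_add(): account an entity in relative (v - v0) form. */
static void toy_add(struct toy_cfs_rq *rq, long long v, long long w)
{
	rq->avg_vruntime += (v - rq->min_vruntime) * w;
	rq->avg_load     += w;
}

/* Like __update_min_vruntime()/avg_vruntime_update(): advance v0, re-base the sum. */
static void toy_advance_min(struct toy_cfs_rq *rq, long long new_min)
{
	long long delta = new_min - rq->min_vruntime;
	if (delta > 0) {
		rq->avg_vruntime -= rq->avg_load * delta;
		rq->min_vruntime  = new_min;
	}
}

/* Like avg_vruntime(): V = v0 + \Sum (v_i - v0) * w_i / \Sum w_i. */
static long long toy_avg(const struct toy_cfs_rq *rq)
{
	long long avg = rq->avg_vruntime;
	if (rq->avg_load)
		avg /= rq->avg_load;
	return rq->min_vruntime + avg;
}

int main(void)
{
	struct toy_cfs_rq rq = { 0, 0, 1000 };

	toy_add(&rq, 1100, 1024);
	toy_add(&rq, 1300, 2048);
	printf("V = %lld\n", toy_avg(&rq)); /* (1100*1024 + 1300*2048) / 3072, truncated: 1233 */

	toy_advance_min(&rq, 1100);         /* min_vruntime moves forward by 100 */
	printf("V = %lld\n", toy_avg(&rq)); /* still 1233: the re-base does not change V */

	return 0;
}

Keeping only the deltas against min_vruntime is what keeps the summed quantity small, which is the point of the v_i == (v_i - v0) + v0 substitution in the comment.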

kernel/sched/sched.h

Lines changed: 5 additions & 0 deletions

@@ -548,6 +548,9 @@ struct cfs_rq {
 	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
+	s64			avg_vruntime;
+	u64			avg_load;
+
 	u64			exec_clock;
 	u64			min_vruntime;
 #ifdef CONFIG_SCHED_CORE
@@ -3483,4 +3486,6 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
 static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif
 
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+
 #endif /* _KERNEL_SCHED_SCHED_H */
