@@ -715,6 +715,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	return cfs_rq->min_vruntime + avg;
 }
 
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	SCHED_WARN_ON(!se->on_rq);
+	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+}
+
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	u64 min_vruntime = cfs_rq->min_vruntime;
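The helper above only records the difference between the queue's weighted-average vruntime and the entity's own vruntime. As a rough illustration of that bookkeeping, the standalone sketch below (not kernel code; the struct, weights and vruntimes are invented and the fixed-point load scale is ignored) evaluates V = (\Sum w_j*v_j) / W and vl_i = V - v_i for a few toy entities:

#include <stdio.h>

/* Toy entity: weight w_i and virtual runtime v_i (invented values). */
struct toy_se { long weight; long vruntime; };

int main(void)
{
	struct toy_se rq[] = { {1024, 100}, {2048, 40}, {1024, 160} };
	long W = 0, sum = 0;

	/* V = (\Sum w_j*v_j) / W -- what the patch calls avg_vruntime() */
	for (int i = 0; i < 3; i++) {
		W   += rq[i].weight;
		sum += rq[i].weight * rq[i].vruntime;
	}
	long V = sum / W;

	/* vl_i = V - v_i: positive lag means the entity is owed service */
	for (int i = 0; i < 3; i++)
		printf("se%d: vlag = %ld\n", i, V - rq[i].vruntime);
	return 0;
}

An entity whose vruntime is below the average ends up with positive vlag (it is owed service); one running ahead of the average gets negative vlag.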
@@ -3492,6 +3501,8 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
+	unsigned long old_weight = se->load.weight;
+
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		if (cfs_rq->curr == se)
@@ -3504,6 +3515,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	update_load_set(&se->load, weight);
 
+	if (!se->on_rq) {
+		/*
+		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+		 * we need to scale se->vlag when w_i changes.
+		 */
+		se->vlag = div_s64(se->vlag * old_weight, weight);
+	}
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
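Only the virtual lag vl_i = V - v_i is stored in se->vlag, while the quantity being conserved is lag_i = w_i * vl_i, so a weight change has to rescale the stored value. A quick sanity check with made-up numbers (plain C division standing in for div_s64()):

#include <stdio.h>

/* Mirrors the rescale in reweight_entity(): keep w_i * vl_i constant
 * across a weight change. All numbers are arbitrary examples. */
int main(void)
{
	long old_weight = 2048, new_weight = 1024;
	long vlag = 300;                          /* stored vl_i at the old weight */

	long lag_before = old_weight * vlag;      /* 614400 */
	vlag = (vlag * old_weight) / new_weight;  /* 600, same idea as div_s64() */
	long lag_after = new_weight * vlag;       /* 614400 again */

	printf("lag before: %ld, after: %ld\n", lag_before, lag_after);
	return 0;
}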
@@ -4853,49 +4872,119 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime = avg_vruntime(cfs_rq);
+	s64 lag = 0;
 
-	/* sleeps up to a single latency don't count. */
-	if (!initial) {
-		unsigned long thresh;
+	/*
+	 * Due to how V is constructed as the weighted average of entities,
+	 * adding tasks with positive lag, or removing tasks with negative lag
+	 * will move 'time' backwards, this can screw around with the lag of
+	 * other tasks.
+	 *
+	 * EEVDF: placement strategy #1 / #2
+	 */
+	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running > 1) {
+		struct sched_entity *curr = cfs_rq->curr;
+		unsigned long load;
 
-		if (se_is_idle(se))
-			thresh = sysctl_sched_min_granularity;
-		else
-			thresh = sysctl_sched_latency;
+		lag = se->vlag;
 
 		/*
-		 * Halve their sleep time's effect, to allow
-		 * for a gentler effect of sleepers:
+		 * If we want to place a task and preserve lag, we have to
+		 * consider the effect of the new entity on the weighted
+		 * average and compensate for this, otherwise lag can quickly
+		 * evaporate.
+		 *
+		 * Lag is defined as:
+		 *
+		 *   lag_i = S - s_i = w_i * (V - v_i)
+		 *
+		 * To avoid the 'w_i' term all over the place, we only track
+		 * the virtual lag:
+		 *
+		 *   vl_i = V - v_i <=> v_i = V - vl_i
+		 *
+		 * And we take V to be the weighted average of all v:
+		 *
+		 *   V = (\Sum w_j*v_j) / W
+		 *
+		 * Where W is: \Sum w_j
+		 *
+		 * Then, the weighted average after adding an entity with lag
+		 * vl_i is given by:
+		 *
+		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
+		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+		 *      = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+		 *      = V - w_i*vl_i / (W + w_i)
+		 *
+		 * And the actual lag after adding an entity with vl_i is:
+		 *
+		 *   vl'_i = V' - v_i
+		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+		 *         = vl_i - w_i*vl_i / (W + w_i)
+		 *
+		 * Which is strictly less than vl_i. So in order to preserve lag
+		 * we should inflate the lag before placement such that the
+		 * effective lag after placement comes out right.
+		 *
+		 * As such, invert the above relation for vl'_i to get the vl_i
+		 * we need to use such that the lag after placement is the lag
+		 * we computed before dequeue.
+		 *
+		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
+		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+		 *
+		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+		 *                   = W*vl_i
+		 *
+		 *   vl_i = (W + w_i)*vl'_i / W
 		 */
-		if (sched_feat(GENTLE_FAIR_SLEEPERS))
-			thresh >>= 1;
-
-		vruntime -= thresh;
-	}
-
-	/*
-	 * Pull vruntime of the entity being placed to the base level of
-	 * cfs_rq, to prevent boosting it if placed backwards.
-	 * However, min_vruntime can advance much faster than real time, with
-	 * the extreme being when an entity with the minimal weight always runs
-	 * on the cfs_rq. If the waking entity slept for a long time, its
-	 * vruntime difference from min_vruntime may overflow s64 and their
-	 * comparison may get inversed, so ignore the entity's original
-	 * vruntime in that case.
-	 * The maximal vruntime speedup is given by the ratio of normal to
-	 * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
-	 * When placing a migrated waking entity, its exec_start has been set
-	 * from a different rq. In order to take into account a possible
-	 * divergence between new and prev rq's clocks task because of irq and
-	 * stolen time, we take an additional margin.
-	 * So, cutting off on the sleep time of
-	 * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
-	 * should be safe.
-	 */
-	if (entity_is_long_sleeper(se))
-		se->vruntime = vruntime;
-	else
-		se->vruntime = max_vruntime(se->vruntime, vruntime);
+		load = cfs_rq->avg_load;
+		if (curr && curr->on_rq)
+			load += curr->load.weight;
+
+		lag *= load + se->load.weight;
+		if (WARN_ON_ONCE(!load))
+			load = 1;
+		lag = div_s64(lag, load);
+
+		vruntime -= lag;
+	}
+
+	if (sched_feat(FAIR_SLEEPERS)) {
+
+		/* sleeps up to a single latency don't count. */
+		if (!initial) {
+			unsigned long thresh;
+
+			if (se_is_idle(se))
+				thresh = sysctl_sched_min_granularity;
+			else
+				thresh = sysctl_sched_latency;
+
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
+			vruntime -= thresh;
+		}
+
+		/*
+		 * Pull vruntime of the entity being placed to the base level of
+		 * cfs_rq, to prevent boosting it if placed backwards. If the entity
+		 * slept for a long time, don't even try to compare its vruntime with
+		 * the base as it may be too far off and the comparison may get
+		 * inversed due to s64 overflow.
+		 */
+		if (!entity_is_long_sleeper(se))
+			vruntime = max_vruntime(se->vruntime, vruntime);
+	}
+
+	se->vruntime = vruntime;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
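To see the compensation derived in the comment above come out numerically, here is an illustrative calculation with arbitrary weights and lag values (again not kernel code; cfs_rq->avg_load and the curr adjustment are collapsed into a single W):

#include <stdio.h>

int main(void)
{
	long W = 3072;         /* combined weight already on the queue (invented) */
	long w_i = 1024;       /* weight of the entity being placed (invented)   */
	long vl_target = 600;  /* lag recorded at dequeue, i.e. vl'_i            */

	/* Inflate as place_entity() does: vl_i = vl'_i * (W + w_i) / W */
	long vl_inflated = vl_target * (W + w_i) / W;                  /* 800 */

	/*
	 * Adding the entity moves V by w_i*vl_i / (W + w_i), so the lag
	 * actually observed after placement is vl_i minus that shift.
	 */
	long vl_effective = vl_inflated - (w_i * vl_inflated) / (W + w_i);

	printf("inflated: %ld, effective after placement: %ld\n",
	       vl_inflated, vl_effective);                   /* 800 and 600 */
	return 0;
}

Inflating the recorded lag of 600 to 800 before placement means that, once the new entity drags the average down by w_i*vl_i / (W + w_i) = 200, the effective lag lands back on the 600 computed at dequeue.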
@@ -5077,6 +5166,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (flags & DEQUEUE_SLEEP)
+		update_entity_lag(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;