Skip to content

Commit 590680d

Browse files
committed
Merge tag 'sched-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes/updates from Thomas Gleixner:
 - Deduplicate the average computations in the scheduler core and the fair class code.
 - Fix a race between runtime distribution and assignment which can cause exceeding the quota by up to 70%.
 - Prevent negative results in the imbalance calculation.
 - Remove a stale warning in the workqueue code which can be triggered since the call site was moved out of preempt disabled code. It's a false positive.
 - Deduplicate the print macros for procfs.
 - Add the uclamp values to the SCHED_DEBUG procfs output for completeness.
* tag 'sched-urgent-2020-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/debug: Add task uclamp values to SCHED_DEBUG procfs
  sched/debug: Factor out printing formats into common macros
  sched/debug: Remove redundant macro define
  sched/core: Remove unused rq::last_load_update_tick
  workqueue: Remove the warning in wq_worker_sleeping()
  sched/fair: Fix negative imbalance in imbalance calculation
  sched/fair: Fix race between runtime distribution and assignment
  sched/fair: Align rq->avg_idle and rq->avg_scan_cost
2 parents 20e2aa8 + 96e74eb commit 590680d

File tree

5 files changed

+51
-62
lines changed

5 files changed

+51
-62
lines changed

kernel/sched/core.c

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2119,12 +2119,6 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21192119
return cpu;
21202120
}
21212121

2122-
static void update_avg(u64 *avg, u64 sample)
2123-
{
2124-
s64 diff = sample - *avg;
2125-
*avg += diff >> 3;
2126-
}
2127-
21282122
void sched_set_stop_task(int cpu, struct task_struct *stop)
21292123
{
21302124
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -4126,7 +4120,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
41264120
* it wants to wake up a task to maintain concurrency.
41274121
* As this function is called inside the schedule() context,
41284122
* we disable preemption to avoid it calling schedule() again
4129-
* in the possible wakeup of a kworker.
4123+
* in the possible wakeup of a kworker and because wq_worker_sleeping()
4124+
* requires it.
41304125
*/
41314126
if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
41324127
preempt_disable();
@@ -6699,7 +6694,6 @@ void __init sched_init(void)
66996694

67006695
rq_attach_root(rq, &def_root_domain);
67016696
#ifdef CONFIG_NO_HZ_COMMON
6702-
rq->last_load_update_tick = jiffies;
67036697
rq->last_blocked_load_update_tick = jiffies;
67046698
atomic_set(&rq->nohz_flags, 0);
67056699
#endif

kernel/sched/debug.c

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -816,10 +816,12 @@ static int __init init_sched_debug_procfs(void)
816816

817817
__initcall(init_sched_debug_procfs);
818818

819-
#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
820-
#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
821-
#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
822-
#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
819+
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
820+
#define __P(F) __PS(#F, F)
821+
#define P(F) __PS(#F, p->F)
822+
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
823+
#define __PN(F) __PSN(#F, F)
824+
#define PN(F) __PSN(#F, p->F)
823825

824826

825827
#ifdef CONFIG_NUMA_BALANCING
@@ -868,18 +870,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
868870
SEQ_printf(m,
869871
"---------------------------------------------------------"
870872
"----------\n");
871-
#define __P(F) \
872-
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
873-
#define P(F) \
874-
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
875-
#define P_SCHEDSTAT(F) \
876-
SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
877-
#define __PN(F) \
878-
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
879-
#define PN(F) \
880-
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
881-
#define PN_SCHEDSTAT(F) \
882-
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
873+
874+
#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
875+
#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
883876

884877
PN(se.exec_start);
885878
PN(se.vruntime);
@@ -939,10 +932,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
939932
}
940933

941934
__P(nr_switches);
942-
SEQ_printf(m, "%-45s:%21Ld\n",
943-
"nr_voluntary_switches", (long long)p->nvcsw);
944-
SEQ_printf(m, "%-45s:%21Ld\n",
945-
"nr_involuntary_switches", (long long)p->nivcsw);
935+
__PS("nr_voluntary_switches", p->nvcsw);
936+
__PS("nr_involuntary_switches", p->nivcsw);
946937

947938
P(se.load.weight);
948939
#ifdef CONFIG_SMP
@@ -955,6 +946,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
955946
P(se.avg.last_update_time);
956947
P(se.avg.util_est.ewma);
957948
P(se.avg.util_est.enqueued);
949+
#endif
950+
#ifdef CONFIG_UCLAMP_TASK
951+
__PS("uclamp.min", p->uclamp[UCLAMP_MIN].value);
952+
__PS("uclamp.max", p->uclamp[UCLAMP_MAX].value);
953+
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
954+
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
958955
#endif
959956
P(policy);
960957
P(prio);
@@ -963,20 +960,15 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
963960
P(dl.deadline);
964961
}
965962
#undef PN_SCHEDSTAT
966-
#undef PN
967-
#undef __PN
968963
#undef P_SCHEDSTAT
969-
#undef P
970-
#undef __P
971964

972965
{
973966
unsigned int this_cpu = raw_smp_processor_id();
974967
u64 t0, t1;
975968

976969
t0 = cpu_clock(this_cpu);
977970
t1 = cpu_clock(this_cpu);
978-
SEQ_printf(m, "%-45s:%21Ld\n",
979-
"clock-delta", (long long)(t1-t0));
971+
__PS("clock-delta", t1-t0);
980972
}
981973

982974
sched_show_numa(p, m);

kernel/sched/fair.c

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4836,11 +4836,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
48364836
resched_curr(rq);
48374837
}
48384838

4839-
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4839+
static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
48404840
{
48414841
struct cfs_rq *cfs_rq;
4842-
u64 runtime;
4843-
u64 starting_runtime = remaining;
4842+
u64 runtime, remaining = 1;
48444843

48454844
rcu_read_lock();
48464845
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -4855,10 +4854,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
48554854
/* By the above check, this should never be true */
48564855
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
48574856

4857+
raw_spin_lock(&cfs_b->lock);
48584858
runtime = -cfs_rq->runtime_remaining + 1;
4859-
if (runtime > remaining)
4860-
runtime = remaining;
4861-
remaining -= runtime;
4859+
if (runtime > cfs_b->runtime)
4860+
runtime = cfs_b->runtime;
4861+
cfs_b->runtime -= runtime;
4862+
remaining = cfs_b->runtime;
4863+
raw_spin_unlock(&cfs_b->lock);
48624864

48634865
cfs_rq->runtime_remaining += runtime;
48644866

@@ -4873,8 +4875,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
48734875
break;
48744876
}
48754877
rcu_read_unlock();
4876-
4877-
return starting_runtime - remaining;
48784878
}
48794879

48804880
/*
@@ -4885,7 +4885,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
48854885
*/
48864886
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
48874887
{
4888-
u64 runtime;
48894888
int throttled;
48904889

48914890
/* no need to continue the timer with no bandwidth constraint */
@@ -4914,24 +4913,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
49144913
cfs_b->nr_throttled += overrun;
49154914

49164915
/*
4917-
* This check is repeated as we are holding onto the new bandwidth while
4918-
* we unthrottle. This can potentially race with an unthrottled group
4919-
* trying to acquire new bandwidth from the global pool. This can result
4920-
* in us over-using our runtime if it is all used during this loop, but
4921-
* only by limited amounts in that extreme case.
4916+
* This check is repeated as we release cfs_b->lock while we unthrottle.
49224917
*/
49234918
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4924-
runtime = cfs_b->runtime;
49254919
cfs_b->distribute_running = 1;
49264920
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49274921
/* we can't nest cfs_b->lock while distributing bandwidth */
4928-
runtime = distribute_cfs_runtime(cfs_b, runtime);
4922+
distribute_cfs_runtime(cfs_b);
49294923
raw_spin_lock_irqsave(&cfs_b->lock, flags);
49304924

49314925
cfs_b->distribute_running = 0;
49324926
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4933-
4934-
lsub_positive(&cfs_b->runtime, runtime);
49354927
}
49364928

49374929
/*
@@ -5065,10 +5057,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
50655057
if (!runtime)
50665058
return;
50675059

5068-
runtime = distribute_cfs_runtime(cfs_b, runtime);
5060+
distribute_cfs_runtime(cfs_b);
50695061

50705062
raw_spin_lock_irqsave(&cfs_b->lock, flags);
5071-
lsub_positive(&cfs_b->runtime, runtime);
50725063
cfs_b->distribute_running = 0;
50735064
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
50745065
}
@@ -6080,8 +6071,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
60806071
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
60816072
struct sched_domain *this_sd;
60826073
u64 avg_cost, avg_idle;
6083-
u64 time, cost;
6084-
s64 delta;
6074+
u64 time;
60856075
int this = smp_processor_id();
60866076
int cpu, nr = INT_MAX;
60876077

@@ -6119,9 +6109,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
61196109
}
61206110

61216111
time = cpu_clock(this) - time;
6122-
cost = this_sd->avg_scan_cost;
6123-
delta = (s64)(time - cost) / 8;
6124-
this_sd->avg_scan_cost += delta;
6112+
update_avg(&this_sd->avg_scan_cost, time);
61256113

61266114
return cpu;
61276115
}
@@ -9048,6 +9036,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
90489036

90499037
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
90509038
sds->total_capacity;
9039+
/*
9040+
* If the local group is more loaded than the selected
9041+
* busiest group don't try to pull any tasks.
9042+
*/
9043+
if (local->avg_load >= busiest->avg_load) {
9044+
env->imbalance = 0;
9045+
return;
9046+
}
90519047
}
90529048

90539049
/*

kernel/sched/sched.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,12 @@ static inline int task_has_dl_policy(struct task_struct *p)
195195

196196
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
197197

198+
static inline void update_avg(u64 *avg, u64 sample)
199+
{
200+
s64 diff = sample - *avg;
201+
*avg += diff / 8;
202+
}
203+
198204
/*
199205
* !! For sched_setattr_nocheck() (kernel) only !!
200206
*
@@ -882,7 +888,6 @@ struct rq {
882888
#endif
883889
#ifdef CONFIG_NO_HZ_COMMON
884890
#ifdef CONFIG_SMP
885-
unsigned long last_load_update_tick;
886891
unsigned long last_blocked_load_update_tick;
887892
unsigned int has_blocked_load;
888893
#endif /* CONFIG_SMP */

kernel/workqueue.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,8 @@ void wq_worker_running(struct task_struct *task)
858858
* @task: task going to sleep
859859
*
860860
* This function is called from schedule() when a busy worker is
861-
* going to sleep.
861+
* going to sleep. Preemption needs to be disabled to protect ->sleeping
862+
* assignment.
862863
*/
863864
void wq_worker_sleeping(struct task_struct *task)
864865
{
@@ -875,7 +876,8 @@ void wq_worker_sleeping(struct task_struct *task)
875876

876877
pool = worker->pool;
877878

878-
if (WARN_ON_ONCE(worker->sleeping))
879+
/* Return if preempted before wq_worker_running() was reached */
880+
if (worker->sleeping)
879881
return;
880882

881883
worker->sleeping = 1;

0 commit comments

Comments
 (0)