Commit 77a4d1a

Peter Zijlstra authored and KAGA-KOKO committed
sched: Cleanup bandwidth timers
Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().

The more I look at that code the more I'm convinced it's crack; that
entire __start_cfs_bandwidth() thing is brain melting. We don't need to
cancel a timer before starting it: *hrtimer_start*() will happily remove
the timer for you if it's still enqueued.

Removing that removes a big part of the problem: no more ugly cancel
loop to get stuck in.

So now, if I understand things right, the entire reason for this
cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
accidentally lose the timer.

It appears to me that the same can be guaranteed by unconditionally
(re)starting the timer when !queued. Regardless of what
hrtimer::function returns, if we beat it to (re)enqueueing the timer, it
doesn't matter.

Now, because hrtimers don't come with any serialization guarantees, we
must ensure that handler and (re)start loop both serialize their access
to the hrtimer, to avoid both trying to forward the timer at the same
time.

Update the rt bandwidth timer to match.

This effectively reverts: 09dc4ab ("sched/fair: Fix
tg_set_cfs_bandwidth() deadlock on rq->lock").

Reported-by: Roman Gushchin <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Cc: Paul Turner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
1 parent 5de2755 commit 77a4d1a
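
In short, the patch reduces (re)starting a bandwidth timer to two small helpers that run under the same cfs_b->lock the period-timer handler takes. A condensed sketch, reconstructed from the core.c and fair.c hunks below (surrounding kernel context omitted, comments added here):

/* kernel/sched/core.c */
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
        /*
         * Only forward an inactive timer; forwarding an active one
         * would swallow an overrun the handler still has to see.
         */
        if (!hrtimer_active(period_timer))
                hrtimer_forward_now(period_timer, period);

        /* hrtimer_start*() dequeues a still-enqueued timer itself. */
        hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
}

/* kernel/sched/fair.c, called with cfs_b->lock held */
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
        start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}

Since the handler forwards the timer under the same cfs_b->lock (see the sched_cfs_period_timer() hunk in fair.c), the two sides never call hrtimer_forward_now() on the period timer concurrently, and if the restart path beats the handler to re-enqueueing the timer, it doesn't matter.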

4 files changed: 31 additions, 61 deletions

kernel/sched/core.c (8 additions, 7 deletions)

@@ -92,10 +92,13 @@
 
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-        if (hrtimer_active(period_timer))
-                return;
+        /*
+         * Do not forward the expiration time of active timers;
+         * we do not want to loose an overrun.
+         */
+        if (!hrtimer_active(period_timer))
+                hrtimer_forward_now(period_timer, period);
 
-        hrtimer_forward_now(period_timer, period);
         hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
@@ -8113,10 +8116,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
         __refill_cfs_bandwidth_runtime(cfs_b);
         /* restart the period timer (if active) to handle new period expiry */
-        if (runtime_enabled && cfs_b->timer_active) {
-                /* force a reprogram */
-                __start_cfs_bandwidth(cfs_b, true);
-        }
+        if (runtime_enabled)
+                start_cfs_bandwidth(cfs_b);
         raw_spin_unlock_irq(&cfs_b->lock);
 
         for_each_online_cpu(i) {

kernel/sched/fair.c (15 additions, 44 deletions)

@@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         if (cfs_b->quota == RUNTIME_INF)
                 amount = min_amount;
         else {
-                /*
-                 * If the bandwidth pool has become inactive, then at least one
-                 * period must have elapsed since the last consumption.
-                 * Refresh the global state and ensure bandwidth timer becomes
-                 * active.
-                 */
-                if (!cfs_b->timer_active) {
-                        __refill_cfs_bandwidth_runtime(cfs_b);
-                        __start_cfs_bandwidth(cfs_b, false);
-                }
+                start_cfs_bandwidth(cfs_b);
 
                 if (cfs_b->runtime > 0) {
                         amount = min(cfs_b->runtime, min_amount);
@@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
         struct sched_entity *se;
         long task_delta, dequeue = 1;
+        bool empty;
 
         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         cfs_rq->throttled = 1;
         cfs_rq->throttled_clock = rq_clock(rq);
         raw_spin_lock(&cfs_b->lock);
+        empty = list_empty(&cfs_rq->throttled_list);
+
         /*
          * Add to the _head_ of the list, so that an already-started
          * distribute_cfs_runtime will not see us
          */
         list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-        if (!cfs_b->timer_active)
-                __start_cfs_bandwidth(cfs_b, false);
+
+        /*
+         * If we're the first throttled task, make sure the bandwidth
+         * timer is running.
+         */
+        if (empty)
+                start_cfs_bandwidth(cfs_b);
+
         raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         if (cfs_b->idle && !throttled)
                 goto out_deactivate;
 
-        /*
-         * if we have relooped after returning idle once, we need to update our
-         * status as actually running, so that other cpus doing
-         * __start_cfs_bandwidth will stop trying to cancel us.
-         */
-        cfs_b->timer_active = 1;
-
         __refill_cfs_bandwidth_runtime(cfs_b);
 
         if (!throttled) {
@@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         return 0;
 
 out_deactivate:
-        cfs_b->timer_active = 0;
         return 1;
 }
 
@@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
         struct cfs_bandwidth *cfs_b =
                 container_of(timer, struct cfs_bandwidth, slack_timer);
+
         do_sched_cfs_slack_timer(cfs_b);
 
         return HRTIMER_NORESTART;
@@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
         struct cfs_bandwidth *cfs_b =
                 container_of(timer, struct cfs_bandwidth, period_timer);
-        ktime_t now;
         int overrun;
         int idle = 0;
 
         raw_spin_lock(&cfs_b->lock);
         for (;;) {
-                now = hrtimer_cb_get_time(timer);
-                overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+                overrun = hrtimer_forward_now(timer, cfs_b->period);
                 if (!overrun)
                         break;
 
@@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-        /*
-         * The timer may be active because we're trying to set a new bandwidth
-         * period or because we're racing with the tear-down path
-         * (timer_active==0 becomes visible before the hrtimer call-back
-         * terminates).  In either case we ensure that it's re-programmed
-         */
-        while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-               hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-                /* bounce the lock to allow do_sched_cfs_period_timer to run */
-                raw_spin_unlock(&cfs_b->lock);
-                cpu_relax();
-                raw_spin_lock(&cfs_b->lock);
-                /* if someone else restarted the timer then we're done */
-                if (!force && cfs_b->timer_active)
-                        return;
-        }
-
-        cfs_b->timer_active = 1;
         start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
 }
 
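For readability, here is the handler side of that serialization, sched_cfs_period_timer(), as it reads after this patch. The declarations and the forward loop come straight from the hunk above; the loop body and the unlock/return tail are unchanged context just outside the hunk, filled in here as a best-effort reconstruction, with comments added here:

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
        int overrun;
        int idle = 0;

        /*
         * Same lock the start_cfs_bandwidth() callers hold, so handler and
         * (re)start path never forward the period timer at the same time.
         */
        raw_spin_lock(&cfs_b->lock);
        for (;;) {
                overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;

                idle = do_sched_cfs_period_timer(cfs_b, overrun);
        }
        raw_spin_unlock(&cfs_b->lock);

        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

If the handler is about to return HRTIMER_NORESTART while another CPU re-enqueues the timer via start_cfs_bandwidth(), the re-enqueue simply wins; that is the "if we beat it to (re)enqueue the timer, it doesn't matter" argument from the changelog.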

kernel/sched/rt.c (6 additions, 8 deletions)

@@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
         struct rt_bandwidth *rt_b =
                 container_of(timer, struct rt_bandwidth, rt_period_timer);
-        ktime_t now;
-        int overrun;
         int idle = 0;
+        int overrun;
 
+        raw_spin_lock(&rt_b->rt_runtime_lock);
         for (;;) {
-                now = hrtimer_cb_get_time(timer);
-                overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+                overrun = hrtimer_forward_now(timer, rt_b->rt_period);
                 if (!overrun)
                         break;
 
+                raw_spin_unlock(&rt_b->rt_runtime_lock);
                 idle = do_sched_rt_period_timer(rt_b, overrun);
+                raw_spin_lock(&rt_b->rt_runtime_lock);
         }
+        raw_spin_unlock(&rt_b->rt_runtime_lock);
 
         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                 return;
 
-        if (hrtimer_active(&rt_b->rt_period_timer))
-                return;
-
         raw_spin_lock(&rt_b->rt_runtime_lock);
         start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
         raw_spin_unlock(&rt_b->rt_runtime_lock);
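The rt side follows the same pattern: rt_b->rt_runtime_lock now covers the handler's forward loop (hunk above), and start_rt_bandwidth() already takes it around start_bandwidth_timer(), so its hrtimer_active() early-out can go. The resulting function, reconstructed from the hunk above with comments added here:

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;

        /*
         * No hrtimer_active() check needed: start_bandwidth_timer() leaves
         * the expiry of an active timer alone, re-queueing is harmless, and
         * rt_runtime_lock serializes this forward against the handler.
         */
        raw_spin_lock(&rt_b->rt_runtime_lock);
        start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
        raw_spin_unlock(&rt_b->rt_runtime_lock);
}

Note that the handler drops rt_runtime_lock around do_sched_rt_period_timer() and re-takes it before forwarding again, so only the hrtimer manipulation itself is serialized.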

kernel/sched/sched.h (2 additions, 2 deletions)

@@ -215,7 +215,7 @@ struct cfs_bandwidth {
         s64 hierarchical_quota;
         u64 runtime_expires;
 
-        int idle, timer_active;
+        int idle;
         struct hrtimer period_timer, slack_timer;
         struct list_head throttled_cfs_rq;
 
@@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
