Skip to content

Commit 2279f54

Browse files
jlelli authored and Peter Zijlstra committed
sched/deadline: Fix priority inheritance with multiple scheduling classes
Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance).

 ------------[ cut here ]------------
 kernel BUG at kernel/sched/deadline.c:1462!
 invalid opcode: 0000 [#1] PREEMPT SMP
 CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ...
 Hardware name: ...
 RIP: 0010:enqueue_task_dl+0x335/0x910
 Code: ...
 RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002
 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500
 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600
 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078
 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600
 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600
 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000
 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 PKRU: 55555554
 Call Trace:
  ? intel_pstate_update_util_hwp+0x13/0x170
  rt_mutex_setprio+0x1cc/0x4b0
  task_blocks_on_rt_mutex+0x225/0x260
  rt_spin_lock_slowlock_locked+0xab/0x2d0
  rt_spin_lock_slowlock+0x50/0x80
  hrtimer_grab_expiry_lock+0x20/0x30
  hrtimer_cancel+0x13/0x30
  do_nanosleep+0xa0/0x150
  hrtimer_nanosleep+0xe1/0x230
  ? __hrtimer_init_sleeper+0x60/0x60
  __x64_sys_nanosleep+0x8d/0xa0
  do_syscall_64+0x4a/0x100
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 RIP: 0033:0x7fa58b52330d
 ...
 ---[ end trace 0000000000000002 ]---"

He also provided a simple reproducer creating the situation below:

 So the execution order of locking steps is the following
 (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2
 are mutexes that are enabled with priority inheritance.)

 Time moves forward as this timeline goes down:

 N1              N2               D1
 |               |                |
 |               |                |
 Lock(M1)        |                |
 |               |                |
 |             Lock(M2)           |
 |               |                |
 |               |              Lock(M2)
 |               |                |
 |             Lock(M1)           |
 |             (!!bug triggered!) |

Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex).

Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG().

Fix this by keeping track of the original donor and using its parameters when a task is boosted.

Reported-by: Glenn Elliott <[email protected]>
Reported-by: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Juri Lelli <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent ec618b8 commit 2279f54

File tree

3 files changed

+68
-50
lines changed

3 files changed

+68
-50
lines changed

include/linux/sched.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,6 @@ struct sched_dl_entity {
551551
* overruns.
552552
*/
553553
unsigned int dl_throttled : 1;
554-
unsigned int dl_boosted : 1;
555554
unsigned int dl_yielded : 1;
556555
unsigned int dl_non_contending : 1;
557556
unsigned int dl_overrun : 1;
@@ -570,6 +569,15 @@ struct sched_dl_entity {
570569
* time.
571570
*/
572571
struct hrtimer inactive_timer;
572+
573+
#ifdef CONFIG_RT_MUTEXES
574+
/*
575+
* Priority Inheritance. When a DEADLINE scheduling entity is boosted
576+
* pi_se points to the donor, otherwise points to the dl_se it belongs
577+
* to (the original one/itself).
578+
*/
579+
struct sched_dl_entity *pi_se;
580+
#endif
573581
};
574582

575583
#ifdef CONFIG_UCLAMP_TASK

kernel/sched/core.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
49124912
if (!dl_prio(p->normal_prio) ||
49134913
(pi_task && dl_prio(pi_task->prio) &&
49144914
dl_entity_preempt(&pi_task->dl, &p->dl))) {
4915-
p->dl.dl_boosted = 1;
4915+
p->dl.pi_se = pi_task->dl.pi_se;
49164916
queue_flag |= ENQUEUE_REPLENISH;
4917-
} else
4918-
p->dl.dl_boosted = 0;
4917+
} else {
4918+
p->dl.pi_se = &p->dl;
4919+
}
49194920
p->sched_class = &dl_sched_class;
49204921
} else if (rt_prio(prio)) {
49214922
if (dl_prio(oldprio))
4922-
p->dl.dl_boosted = 0;
4923+
p->dl.pi_se = &p->dl;
49234924
if (oldprio < prio)
49244925
queue_flag |= ENQUEUE_HEAD;
49254926
p->sched_class = &rt_sched_class;
49264927
} else {
49274928
if (dl_prio(oldprio))
4928-
p->dl.dl_boosted = 0;
4929+
p->dl.pi_se = &p->dl;
49294930
if (rt_prio(oldprio))
49304931
p->rt.timeout = 0;
49314932
p->sched_class = &fair_sched_class;

kernel/sched/deadline.c

Lines changed: 53 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
4343
return !RB_EMPTY_NODE(&dl_se->rb_node);
4444
}
4545

46+
#ifdef CONFIG_RT_MUTEXES
47+
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
48+
{
49+
return dl_se->pi_se;
50+
}
51+
52+
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
53+
{
54+
return pi_of(dl_se) != dl_se;
55+
}
56+
#else
57+
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
58+
{
59+
return dl_se;
60+
}
61+
62+
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
63+
{
64+
return false;
65+
}
66+
#endif
67+
4668
#ifdef CONFIG_SMP
4769
static inline struct dl_bw *dl_bw_of(int i)
4870
{
@@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
698720
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
699721
struct rq *rq = rq_of_dl_rq(dl_rq);
700722

701-
WARN_ON(dl_se->dl_boosted);
723+
WARN_ON(is_dl_boosted(dl_se));
702724
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
703725

704726
/*
@@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
736758
* could happen are, typically, a entity voluntarily trying to overcome its
737759
* runtime, or it just underestimated it during sched_setattr().
738760
*/
739-
static void replenish_dl_entity(struct sched_dl_entity *dl_se,
740-
struct sched_dl_entity *pi_se)
761+
static void replenish_dl_entity(struct sched_dl_entity *dl_se)
741762
{
742763
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
743764
struct rq *rq = rq_of_dl_rq(dl_rq);
744765

745-
BUG_ON(pi_se->dl_runtime <= 0);
766+
BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
746767

747768
/*
748769
* This could be the case for a !-dl task that is boosted.
749770
* Just go with full inherited parameters.
750771
*/
751772
if (dl_se->dl_deadline == 0) {
752-
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
753-
dl_se->runtime = pi_se->dl_runtime;
773+
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
774+
dl_se->runtime = pi_of(dl_se)->dl_runtime;
754775
}
755776

756777
if (dl_se->dl_yielded && dl_se->runtime > 0)
@@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
763784
* arbitrary large.
764785
*/
765786
while (dl_se->runtime <= 0) {
766-
dl_se->deadline += pi_se->dl_period;
767-
dl_se->runtime += pi_se->dl_runtime;
787+
dl_se->deadline += pi_of(dl_se)->dl_period;
788+
dl_se->runtime += pi_of(dl_se)->dl_runtime;
768789
}
769790

770791
/*
@@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
778799
*/
779800
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
780801
printk_deferred_once("sched: DL replenish lagged too much\n");
781-
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
782-
dl_se->runtime = pi_se->dl_runtime;
802+
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
803+
dl_se->runtime = pi_of(dl_se)->dl_runtime;
783804
}
784805

785806
if (dl_se->dl_yielded)
@@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
812833
* task with deadline equal to period this is the same of using
813834
* dl_period instead of dl_deadline in the equation above.
814835
*/
815-
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
816-
struct sched_dl_entity *pi_se, u64 t)
836+
static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
817837
{
818838
u64 left, right;
819839

@@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
835855
* of anything below microseconds resolution is actually fiction
836856
* (but still we want to give the user that illusion >;).
837857
*/
838-
left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
858+
left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
839859
right = ((dl_se->deadline - t) >> DL_SCALE) *
840-
(pi_se->dl_runtime >> DL_SCALE);
860+
(pi_of(dl_se)->dl_runtime >> DL_SCALE);
841861

842862
return dl_time_before(right, left);
843863
}
@@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
922942
* Please refer to the comments update_dl_revised_wakeup() function to find
923943
* more about the Revised CBS rule.
924944
*/
925-
static void update_dl_entity(struct sched_dl_entity *dl_se,
926-
struct sched_dl_entity *pi_se)
945+
static void update_dl_entity(struct sched_dl_entity *dl_se)
927946
{
928947
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
929948
struct rq *rq = rq_of_dl_rq(dl_rq);
930949

931950
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
932-
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
951+
dl_entity_overflow(dl_se, rq_clock(rq))) {
933952

934953
if (unlikely(!dl_is_implicit(dl_se) &&
935954
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
936-
!dl_se->dl_boosted)){
955+
!is_dl_boosted(dl_se))) {
937956
update_dl_revised_wakeup(dl_se, rq);
938957
return;
939958
}
940959

941-
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
942-
dl_se->runtime = pi_se->dl_runtime;
960+
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
961+
dl_se->runtime = pi_of(dl_se)->dl_runtime;
943962
}
944963
}
945964

@@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
10381057
* The task might have been boosted by someone else and might be in the
10391058
* boosting/deboosting path, its not throttled.
10401059
*/
1041-
if (dl_se->dl_boosted)
1060+
if (is_dl_boosted(dl_se))
10421061
goto unlock;
10431062

10441063
/*
@@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
10661085
* but do not enqueue -- wait for our wakeup to do that.
10671086
*/
10681087
if (!task_on_rq_queued(p)) {
1069-
replenish_dl_entity(dl_se, dl_se);
1088+
replenish_dl_entity(dl_se);
10701089
goto unlock;
10711090
}
10721091

@@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
11561175

11571176
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
11581177
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
1159-
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
1178+
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
11601179
return;
11611180
dl_se->dl_throttled = 1;
11621181
if (dl_se->runtime > 0)
@@ -1287,7 +1306,7 @@ static void update_curr_dl(struct rq *rq)
12871306
dl_se->dl_overrun = 1;
12881307

12891308
__dequeue_task_dl(rq, curr, 0);
1290-
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
1309+
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
12911310
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
12921311

12931312
if (!is_leftmost(curr, &rq->dl))
@@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
14811500
}
14821501

14831502
static void
1484-
enqueue_dl_entity(struct sched_dl_entity *dl_se,
1485-
struct sched_dl_entity *pi_se, int flags)
1503+
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
14861504
{
14871505
BUG_ON(on_dl_rq(dl_se));
14881506

@@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
14931511
*/
14941512
if (flags & ENQUEUE_WAKEUP) {
14951513
task_contending(dl_se, flags);
1496-
update_dl_entity(dl_se, pi_se);
1514+
update_dl_entity(dl_se);
14971515
} else if (flags & ENQUEUE_REPLENISH) {
1498-
replenish_dl_entity(dl_se, pi_se);
1516+
replenish_dl_entity(dl_se);
14991517
} else if ((flags & ENQUEUE_RESTORE) &&
15001518
dl_time_before(dl_se->deadline,
15011519
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
@@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
15121530

15131531
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
15141532
{
1515-
struct task_struct *pi_task = rt_mutex_get_top_task(p);
1516-
struct sched_dl_entity *pi_se = &p->dl;
1517-
1518-
/*
1519-
* Use the scheduling parameters of the top pi-waiter task if:
1520-
* - we have a top pi-waiter which is a SCHED_DEADLINE task AND
1521-
* - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
1522-
* smaller than our deadline OR we are a !SCHED_DEADLINE task getting
1523-
* boosted due to a SCHED_DEADLINE pi-waiter).
1524-
* Otherwise we keep our runtime and deadline.
1525-
*/
1526-
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
1527-
pi_se = &pi_task->dl;
1533+
if (is_dl_boosted(&p->dl)) {
15281534
/*
15291535
* Because of delays in the detection of the overrun of a
15301536
* thread's runtime, it might be the case that a thread
@@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
15571563
* the throttle.
15581564
*/
15591565
p->dl.dl_throttled = 0;
1560-
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
1566+
BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
15611567
return;
15621568
}
15631569

@@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
15941600
return;
15951601
}
15961602

1597-
enqueue_dl_entity(&p->dl, pi_se, flags);
1603+
enqueue_dl_entity(&p->dl, flags);
15981604

15991605
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
16001606
enqueue_pushable_dl_task(rq, p);
@@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p)
27872793
dl_se->dl_bw = 0;
27882794
dl_se->dl_density = 0;
27892795

2790-
dl_se->dl_boosted = 0;
27912796
dl_se->dl_throttled = 0;
27922797
dl_se->dl_yielded = 0;
27932798
dl_se->dl_non_contending = 0;
27942799
dl_se->dl_overrun = 0;
2800+
2801+
#ifdef CONFIG_RT_MUTEXES
2802+
dl_se->pi_se = dl_se;
2803+
#endif
27952804
}
27962805

27972806
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)

0 commit comments

Comments
 (0)