
Commit 5f220be

KAGA-KOKO authored and Ingo Molnar committed
sched/wakeup: Prepare for RT sleeping spin/rwlocks
Waiting for spinlocks and rwlocks on non RT enabled kernels is task::state
preserving. Any wakeup which matches the state is valid.

RT enabled kernels substitute them with 'sleeping' spinlocks. This creates
an issue vs. task::__state.

In order to block on the lock, the task has to overwrite task::__state and a
consecutive wakeup issued by the unlocker sets the state back to
TASK_RUNNING. As a consequence the task loses the state which was set
before the lock acquire and also any regular wakeup targeted at the task
while it is blocked on the lock.

To handle this gracefully, add a 'saved_state' member to task_struct which
is used in the following way:

 1) When a task blocks on a 'sleeping' spinlock, the current state is saved
    in task::saved_state before it is set to TASK_RTLOCK_WAIT.

 2) When the task unblocks and after acquiring the lock, it restores the
    saved state.

 3) When a regular wakeup happens for a task while it is blocked then the
    state change of that wakeup is redirected to operate on
    task::saved_state. This is also required when the task state is running
    because the task might have been woken up from the lock wait and has
    not yet restored the saved state.

To make it complete, provide the necessary helpers to save and restore the
saved state along with the necessary documentation of how the RT lock
blocking is supposed to work.

For non-RT kernels there is no functional change.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent 85019c1 commit 5f220be
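To make the changelog's rules 1) and 2) concrete, a 'sleeping' lock slowpath is expected to bracket its wait loop with the two new helpers, mirroring the documentation this commit adds to include/linux/sched.h (see the first hunk below). This is only a sketch: the function name, the lock type and try_lock() are illustrative stand-ins, and schedule_rtlock() is referenced by the new comment but not provided by this commit.

	/* Illustrative only -- not part of this commit. */
	static void rtlock_slowlock(struct rtlock *lock)
	{
		raw_spin_lock_irq(&lock->wait_lock);

		/* Rule 1: save the current state, switch to TASK_RTLOCK_WAIT */
		current_save_and_set_rtlock_wait_state();

		for (;;) {
			if (try_lock(lock))	/* stand-in for the real trylock */
				break;
			raw_spin_unlock_irq(&lock->wait_lock);
			schedule_rtlock();	/* not defined in this commit */
			raw_spin_lock_irq(&lock->wait_lock);
			set_current_state(TASK_RTLOCK_WAIT);
		}

		/* Rule 2: lock acquired, restore the state saved under rule 1 */
		current_restore_rtlock_saved_state();

		raw_spin_unlock_irq(&lock->wait_lock);
	}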

2 files changed: 99 additions, 0 deletions

include/linux/sched.h

Lines changed: 66 additions & 0 deletions
@@ -143,9 +143,22 @@ struct task_group;
 		current->task_state_change = _THIS_IP_;		\
 	} while (0)
 
+# define debug_rtlock_wait_set_state()				\
+	do {							\
+		current->saved_state_change = current->task_state_change;\
+		current->task_state_change = _THIS_IP_;		\
+	} while (0)
+
+# define debug_rtlock_wait_restore_state()			\
+	do {							\
+		current->task_state_change = current->saved_state_change;\
+	} while (0)
+
 #else
 # define debug_normal_state_change(cond)	do { } while (0)
 # define debug_special_state_change(cond)	do { } while (0)
+# define debug_rtlock_wait_set_state()		do { } while (0)
+# define debug_rtlock_wait_restore_state()	do { } while (0)
 #endif
 
 /*
@@ -213,6 +226,51 @@ struct task_group;
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
 
+/*
+ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
+ *
+ * RT's spin/rwlock substitutions are state preserving. The state of the
+ * task when blocking on the lock is saved in task_struct::saved_state and
+ * restored after the lock has been acquired. These operations are
+ * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
+ * lock related wakeups while the task is blocked on the lock are
+ * redirected to operate on task_struct::saved_state to ensure that these
+ * are not dropped. On restore task_struct::saved_state is set to
+ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
+ *
+ * The lock operation looks like this:
+ *
+ *	current_save_and_set_rtlock_wait_state();
+ *	for (;;) {
+ *		if (try_lock())
+ *			break;
+ *		raw_spin_unlock_irq(&lock->wait_lock);
+ *		schedule_rtlock();
+ *		raw_spin_lock_irq(&lock->wait_lock);
+ *		set_current_state(TASK_RTLOCK_WAIT);
+ *	}
+ *	current_restore_rtlock_saved_state();
+ */
+#define current_save_and_set_rtlock_wait_state()		\
+	do {							\
+		lockdep_assert_irqs_disabled();			\
+		raw_spin_lock(&current->pi_lock);		\
+		current->saved_state = current->__state;	\
+		debug_rtlock_wait_set_state();			\
+		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);	\
+		raw_spin_unlock(&current->pi_lock);		\
+	} while (0);
+
+#define current_restore_rtlock_saved_state()			\
+	do {							\
+		lockdep_assert_irqs_disabled();			\
+		raw_spin_lock(&current->pi_lock);		\
+		debug_rtlock_wait_restore_state();		\
+		WRITE_ONCE(current->__state, current->saved_state);\
+		current->saved_state = TASK_RUNNING;		\
+		raw_spin_unlock(&current->pi_lock);		\
+	} while (0);
+
 #define get_current_state()	READ_ONCE(current->__state)
 
 /* Task command name length: */
@@ -668,6 +726,11 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
+#ifdef CONFIG_PREEMPT_RT
+	/* saved state for "spinlock sleepers" */
+	unsigned int			saved_state;
+#endif
+
 	/*
 	 * This begins the randomizable portion of task_struct. Only
 	 * scheduling-critical items should be added above here.
@@ -1357,6 +1420,9 @@ struct task_struct {
 	struct kmap_ctrl		kmap_ctrl;
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	unsigned long			task_state_change;
+# ifdef CONFIG_PREEMPT_RT
+	unsigned long			saved_state_change;
+# endif
 #endif
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU

kernel/sched/core.c

Lines changed: 33 additions & 0 deletions
@@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *
  * The caller holds p::pi_lock if p != current or has preemption
  * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ *   The related locking code always holds p::pi_lock when updating
+ *   p::saved_state, which means the code is fully serialized in both cases.
+ *
+ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ *   bits set. This allows to distinguish all wakeup scenarios.
  */
 static __always_inline
 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 {
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+			     state != TASK_RTLOCK_WAIT);
+	}
+
 	if (READ_ONCE(p->__state) & state) {
 		*success = 1;
 		return true;
 	}
+
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Saved state preserves the task state across blocking on
+	 * an RT lock. If the state matches, set p::saved_state to
+	 * TASK_RUNNING, but do not wake the task because it waits
+	 * for a lock wakeup. Also indicate success because from
+	 * the regular waker's point of view this has succeeded.
+	 *
+	 * After acquiring the lock the task will restore p::__state
+	 * from p::saved_state which ensures that the regular
+	 * wakeup is not lost. The restore will also set
+	 * p::saved_state to TASK_RUNNING so any further tests will
+	 * not result in false positives vs. @success
+	 */
+	if (p->saved_state & state) {
+		p->saved_state = TASK_RUNNING;
+		*success = 1;
+	}
+#endif
 	return false;
 }
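The redirection added above (rule 3 of the changelog) can be modelled outside the kernel. The following is a self-contained user-space sketch under the assumption that a task in TASK_INTERRUPTIBLE blocks on an RT lock, receives a regular wakeup while it waits for the lock, and restores its state after acquiring it; the struct, the constant values and the helper name are stand-ins, only the matching and redirection logic follows ttwu_state_match() above.

	#include <stdbool.h>
	#include <stdio.h>

	#define TASK_RUNNING		0x0000
	#define TASK_INTERRUPTIBLE	0x0001
	#define TASK_RTLOCK_WAIT	0x1000	/* value chosen for illustration */

	struct task { unsigned int state, saved_state; };

	/*
	 * Mirrors the PREEMPT_RT branch above: a wakeup that matches
	 * saved_state is recorded there instead of waking the task.
	 */
	static bool state_match(struct task *p, unsigned int state, int *success)
	{
		if (p->state & state) {
			*success = 1;
			return true;			/* normal wakeup path */
		}
		if (p->saved_state & state) {
			p->saved_state = TASK_RUNNING;	/* redirect, do not wake */
			*success = 1;
		}
		return false;
	}

	int main(void)
	{
		struct task t = { .state = TASK_INTERRUPTIBLE, .saved_state = TASK_RUNNING };
		int success = 0;

		/* 1) Task blocks on a sleeping lock: save state, wait for the lock. */
		t.saved_state = t.state;
		t.state = TASK_RTLOCK_WAIT;

		/* 3) A regular wakeup arrives while the task waits for the lock:
		 *    it is redirected to saved_state and reported as successful. */
		state_match(&t, TASK_INTERRUPTIBLE, &success);

		/* 2) Lock acquired: restore the saved state. Because the wakeup
		 *    was redirected, the task comes back in TASK_RUNNING. */
		t.state = t.saved_state;
		t.saved_state = TASK_RUNNING;

		printf("wakeup reported: %d, final state: %#x\n", success, t.state);
		return 0;
	}

Run, this reports the regular wakeup as successful and leaves the task in TASK_RUNNING, which is why that wakeup is not lost even though the actual wakeup of the task came from the lock release.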
