
Commit c5b2cbd

Manfred Spraul authored and Linus Torvalds committed
ipc/mqueue.c: update/document memory barriers
Update and document memory barriers for mqueue.c:

- ewp->state is read without any locks, thus READ_ONCE is required.

- add smp_acquire__after_ctrl_dep() after the READ_ONCE; we need acquire
  semantics if the value is STATE_READY.

- use wake_q_add_safe().

- document why __set_current_state() may be used: reading task->state cannot
  happen before the wake_q_add() call, which happens while holding info->lock.
  Thus the spin_unlock() is the RELEASE, and the spin_lock() is the ACQUIRE.

For completeness, there is also a 3-CPU scenario, if the to-be-woken-up task
is already on another wake_q. Then:

- CPU1: spin_unlock() of the task that goes to sleep is the RELEASE
- CPU2: the spin_lock() of the waker is the ACQUIRE
- CPU2: smp_mb__before_atomic() inside wake_q_add() is the RELEASE
- CPU3: smp_mb__after_spinlock() inside try_to_wake_up() is the ACQUIRE

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Manfred Spraul <[email protected]>
Reviewed-by: Davidlohr Bueso <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
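For illustration only (not part of the commit): a minimal userspace C11 sketch of the release/acquire pairing described above, where atomic_store_explicit(..., memory_order_release) stands in for smp_store_release(), and a relaxed load followed by atomic_thread_fence(memory_order_acquire) stands in for READ_ONCE() plus smp_acquire__after_ctrl_dep(). The struct and field names (waiter, state, msg) are invented for the example.

/* Userspace sketch, not kernel code: publish a payload, then publish
 * "ready" with a release store; the reader checks "ready" locklessly and
 * only touches the payload after an acquire fence. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { STATE_NONE, STATE_READY };

struct waiter {
        _Atomic int state;
        const char *msg;        /* payload published by the waker */
};

static void *waker(void *arg)
{
        struct waiter *w = arg;

        w->msg = "hello";       /* 1: write the payload */
        /* 2: release store, like smp_store_release(&this->state, STATE_READY) */
        atomic_store_explicit(&w->state, STATE_READY, memory_order_release);
        return NULL;
}

int main(void)
{
        struct waiter w;
        pthread_t t;

        atomic_init(&w.state, STATE_NONE);
        w.msg = NULL;
        pthread_create(&t, NULL, waker, &w);

        /* Sleeper side: lockless read, like READ_ONCE(ewp->state) ... */
        while (atomic_load_explicit(&w.state, memory_order_relaxed) != STATE_READY)
                ;
        /* ... followed by the analogue of smp_acquire__after_ctrl_dep():
         * an acquire fence after the control dependency on the READY value. */
        atomic_thread_fence(memory_order_acquire);

        printf("%s\n", w.msg);  /* sees "hello", never a stale NULL */
        pthread_join(t, NULL);
        return 0;
}

Compile with cc -pthread. If the release store and the acquire fence were dropped, the final read of w.msg could legally observe a stale NULL, which is the kind of race the barriers in this commit close.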
1 parent ed29f17 commit c5b2cbd

1 file changed: 78 additions and 14 deletions


ipc/mqueue.c

Lines changed: 78 additions & 14 deletions
@@ -63,6 +63,66 @@ struct posix_msg_tree_node {
         int priority;
 };
 
+/*
+ * Locking:
+ *
+ * Accesses to a message queue are synchronized by acquiring info->lock.
+ *
+ * There are two notable exceptions:
+ * - The actual wakeup of a sleeping task is performed using the wake_q
+ *   framework. info->lock is already released when wake_up_q is called.
+ * - The exit codepaths after sleeping check ext_wait_queue->state without
+ *   any locks. If it is STATE_READY, then the syscall is completed without
+ *   acquiring info->lock.
+ *
+ * MQ_BARRIER:
+ * To achieve proper release/acquire memory barrier pairing, the state is set to
+ * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
+ * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
+ *
+ * This prevents the following races:
+ *
+ * 1) With the simple wake_q_add(), the task could be gone already before
+ *    the increase of the reference happens
+ * Thread A
+ *                              Thread B
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              wake_q_add(A)
+ *                              if (cmpxchg()) // success
+ *                                 ->state = STATE_READY (reordered)
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * sysret to user space
+ * sys_exit()
+ *                              get_task_struct() // UaF
+ *
+ * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
+ * the smp_store_release() that does ->state = STATE_READY.
+ *
+ * 2) Without proper _release/_acquire barriers, the woken up task
+ *    could read stale data
+ *
+ * Thread A
+ *                              Thread B
+ * do_mq_timedreceive
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ *                              state = STATE_READY;
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * msg_ptr = wait.msg;          // Access to stale data!
+ *                              receiver->msg = message; (reordered)
+ *
+ * Solution: use _release and _acquire barriers.
+ *
+ * 3) There is intentionally no barrier when setting current->state
+ *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
+ *    release memory barrier, and the wakeup is triggered when holding
+ *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
+ *    acquire memory barrier.
+ */
+
 struct ext_wait_queue {         /* queue of sleeping tasks */
         struct task_struct *task;
         struct list_head list;
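The following is a schematic userspace analogue of point 3 in the comment above, not part of the commit: a pthread mutex stands in for info->lock, and a plain int stands in for state that is only ever written under the lock; all names are invented. It shows why a store made while holding the lock needs no extra barrier: the unlock on the writing side is the RELEASE, and the lock on the reading side is the ACQUIRE.

#include <pthread.h>
#include <stdio.h>

enum { STATE_NONE, STATE_WAITING };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* stands in for info->lock */
static int state = STATE_NONE;  /* plain int: only ever touched under "lock" */

static void *sleeper(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        state = STATE_WAITING;          /* plain store, no explicit barrier */
        pthread_mutex_unlock(&lock);    /* RELEASE half of the pairing */
        return NULL;
}

int main(void)
{
        pthread_t t;
        int seen;

        pthread_create(&t, NULL, sleeper, NULL);

        /* "Waker" side: the lock acquire pairs with the sleeper's unlock, so
         * once STATE_WAITING is observed, everything the sleeper wrote before
         * unlocking is visible too, with no additional barrier. */
        do {
                pthread_mutex_lock(&lock);      /* ACQUIRE half of the pairing */
                seen = state;
                pthread_mutex_unlock(&lock);
        } while (seen != STATE_WAITING);

        printf("saw STATE_WAITING under the lock\n");
        pthread_join(t, NULL);
        return 0;
}

The same reasoning covers __set_current_state(TASK_INTERRUPTIBLE) in wq_sleep() below: it happens while holding info->lock, and the wakeup is triggered with info->lock held as well.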
@@ -646,18 +706,23 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
         wq_add(info, sr, ewp);
 
         for (;;) {
+                /* memory barrier not required, we hold info->lock */
                 __set_current_state(TASK_INTERRUPTIBLE);
 
                 spin_unlock(&info->lock);
                 time = schedule_hrtimeout_range_clock(timeout, 0,
                         HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
-                if (ewp->state == STATE_READY) {
+                if (READ_ONCE(ewp->state) == STATE_READY) {
+                        /* see MQ_BARRIER for purpose/pairing */
+                        smp_acquire__after_ctrl_dep();
                         retval = 0;
                         goto out;
                 }
                 spin_lock(&info->lock);
-                if (ewp->state == STATE_READY) {
+
+                /* we hold info->lock, so no memory barrier required */
+                if (READ_ONCE(ewp->state) == STATE_READY) {
                         retval = 0;
                         goto out_unlock;
                 }
@@ -923,16 +988,11 @@ static inline void __pipelined_op(struct wake_q_head *wake_q,
                                   struct ext_wait_queue *this)
 {
         list_del(&this->list);
-        wake_q_add(wake_q, this->task);
-        /*
-         * Rely on the implicit cmpxchg barrier from wake_q_add such
-         * that we can ensure that updating receiver->state is the last
-         * write operation: As once set, the receiver can continue,
-         * and if we don't have the reference count from the wake_q,
-         * yet, at that point we can later have a use-after-free
-         * condition and bogus wakeup.
-         */
-        this->state = STATE_READY;
+        get_task_struct(this->task);
+
+        /* see MQ_BARRIER for purpose/pairing */
+        smp_store_release(&this->state, STATE_READY);
+        wake_q_add_safe(wake_q, this->task);
 }
 
 /* pipelined_send() - send a message directly to the task waiting in
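For illustration only (not part of the commit): a userspace C11 sketch of the ordering the new __pipelined_op() relies on. An invented refcounted struct waiter stands in for the task, and waiter_get()/waiter_put() stand in for get_task_struct()/put_task_struct(). As in race 1 of the MQ_BARRIER comment, the waker pins the object before the release store of STATE_READY, so even if the woken side drops its reference immediately after seeing STATE_READY, the waker's later accesses (here its own waiter_put(); in the kernel, wake_q_add_safe()) are not a use-after-free.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

enum { STATE_NONE, STATE_READY };

struct waiter {
        _Atomic int refcount;
        _Atomic int state;
};

static void waiter_get(struct waiter *w)
{
        atomic_fetch_add_explicit(&w->refcount, 1, memory_order_relaxed);
}

static void waiter_put(struct waiter *w)
{
        /* last reference dropped: free the object */
        if (atomic_fetch_sub_explicit(&w->refcount, 1, memory_order_acq_rel) == 1) {
                printf("freeing waiter\n");
                free(w);
        }
}

static void *sleeper(void *arg)
{
        struct waiter *w = arg;

        /* Wait until the waker publishes STATE_READY (acquire load used
         * here for brevity) ... */
        while (atomic_load_explicit(&w->state, memory_order_acquire) != STATE_READY)
                ;
        /* ... then drop the reference right away, as a task returning to
         * user space (or exiting) would. */
        waiter_put(w);
        return NULL;
}

int main(void)
{
        struct waiter *w = malloc(sizeof(*w));
        pthread_t t;

        atomic_init(&w->refcount, 1);   /* the sleeper's own reference */
        atomic_init(&w->state, STATE_NONE);
        pthread_create(&t, NULL, sleeper, w);

        /* Waker, mirroring __pipelined_op(): pin first, publish second. */
        waiter_get(w);                                          /* like get_task_struct() */
        atomic_store_explicit(&w->state, STATE_READY,
                              memory_order_release);            /* like smp_store_release() */
        /* w is still safe to touch here, no matter how quickly the sleeper
         * ran away; in the kernel this is where wake_q_add_safe() runs. */
        waiter_put(w);

        pthread_join(t, NULL);
        return 0;
}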
@@ -1049,7 +1109,9 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
         } else {
                 wait.task = current;
                 wait.msg = (void *) msg_ptr;
-                wait.state = STATE_NONE;
+
+                /* memory barrier not required, we hold info->lock */
+                WRITE_ONCE(wait.state, STATE_NONE);
                 ret = wq_sleep(info, SEND, timeout, &wait);
                 /*
                  * wq_sleep must be called with info->lock held, and
@@ -1152,7 +1214,9 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
                 ret = -EAGAIN;
         } else {
                 wait.task = current;
-                wait.state = STATE_NONE;
+
+                /* memory barrier not required, we hold info->lock */
+                WRITE_ONCE(wait.state, STATE_NONE);
                 ret = wq_sleep(info, RECV, timeout, &wait);
                 msg_ptr = wait.msg;
         }
