
Commit d257cc8

Waiman Long authored and Peter Zijlstra committed
locking/rwsem: Make handoff bit handling more consistent
There are some inconsistencies in the way that the handoff bit is being handled in readers and writers, and they lead to a race condition.

Firstly, when a queue head writer sets the handoff bit, it will clear it when the writer is killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader: the handoff bit will simply be inherited by the next waiter.

Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially leave the handoff bit set with an empty wait queue.

Worse, rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation. This leads to a race condition where RWSEM_FLAG_HANDOFF can be double subtracted, corrupting ->count.

To make the handoff bit handling more consistent and robust, extract the handoff bit clearing code into a new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section.

The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty, to avoid a possible race condition. If the first waiter with the handoff bit set is killed or interrupted and exits the slowpath without acquiring the lock, the next waiter will inherit the handoff bit.

While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read.

Fixes: 4f23dbc ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Reported-by: Zhenhua Ma <[email protected]>
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
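For quick reference, the heart of the change is the new rwsem_del_waiter() helper, reproduced here from the diff below. Both slowpaths now funnel their out_nolock cleanup through it, so the RWSEM_FLAG_WAITERS and RWSEM_FLAG_HANDOFF bits are only cleared under wait_lock and only when the wait list is actually empty:

    static inline void
    rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
    {
            lockdep_assert_held(&sem->wait_lock);
            list_del(&waiter->list);
            if (likely(!list_empty(&sem->wait_list)))
                    return;

            atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
    }

With this in place, the out_nolock paths of both rwsem_down_read_slowpath() and rwsem_down_write_slowpath() reduce to a single rwsem_del_waiter(sem, &waiter) call made while holding wait_lock; a consolidated view of the simplified writer trylock loop follows the diff below.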
1 parent: 1360572 · commit: d257cc8


kernel/locking/rwsem.c

Lines changed: 85 additions & 86 deletions
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers      -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter()                 -- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
         struct task_struct *task;
         enum rwsem_waiter_type type;
         unsigned long timeout;
+
+        /* Writer only, not initialized in reader */
+        bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
         list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
         RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-        WRITER_NOT_FIRST,       /* Writer is not first in wait list */
-        WRITER_FIRST,           /* Writer is first in wait list */
-        WRITER_HANDOFF          /* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP      0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+        lockdep_assert_held(&sem->wait_lock);
+        list_add_tail(&waiter->list, &sem->wait_list);
+        /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+        lockdep_assert_held(&sem->wait_lock);
+        list_del(&waiter->list);
+        if (likely(!list_empty(&sem->wait_list)))
+                return;
+
+        atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  * preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
                             enum rwsem_wake_type wake_type,
@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 
         adjustment = woken * RWSEM_READER_BIAS - adjustment;
         lockevent_cond_inc(rwsem_wake_reader, woken);
+
+        oldcount = atomic_long_read(&sem->count);
         if (list_empty(&sem->wait_list)) {
-                /* hit end of list above */
+                /*
+                 * Combined with list_move_tail() above, this implies
+                 * rwsem_del_waiter().
+                 */
                 adjustment -= RWSEM_FLAG_WAITERS;
+                if (oldcount & RWSEM_FLAG_HANDOFF)
+                        adjustment -= RWSEM_FLAG_HANDOFF;
+        } else if (woken) {
+                /*
+                 * When we've woken a reader, we no longer need to force
+                 * writers to give up the lock and we can clear HANDOFF.
+                 */
+                if (oldcount & RWSEM_FLAG_HANDOFF)
+                        adjustment -= RWSEM_FLAG_HANDOFF;
         }
 
-        /*
-         * When we've woken a reader, we no longer need to force writers
-         * to give up the lock and we can clear HANDOFF.
-         */
-        if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-                adjustment -= RWSEM_FLAG_HANDOFF;
-
         if (adjustment)
                 atomic_long_add(adjustment, &sem->count);
 
@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-                                        enum writer_wait_state wstate)
+                                        struct rwsem_waiter *waiter)
 {
+        bool first = rwsem_first_waiter(sem) == waiter;
         long count, new;
 
         lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
         do {
                 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-                if (has_handoff && wstate == WRITER_NOT_FIRST)
-                        return false;
+                if (has_handoff) {
+                        if (!first)
+                                return false;
+
+                        /* First waiter inherits a previously set handoff bit */
+                        waiter->handoff_set = true;
+                }
 
                 new = count;
 
                 if (count & RWSEM_LOCK_MASK) {
-                        if (has_handoff || (wstate != WRITER_HANDOFF))
+                        if (has_handoff || (!rt_task(waiter->task) &&
+                                            !time_after(jiffies, waiter->timeout)))
                                 return false;
 
                         new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
          * We have either acquired the lock with handoff bit cleared or
          * set the handoff bit.
          */
-        if (new & RWSEM_FLAG_HANDOFF)
+        if (new & RWSEM_FLAG_HANDOFF) {
+                waiter->handoff_set = true;
+                lockevent_inc(rwsem_wlock_handoff);
                 return false;
+        }
 
+        /*
+         * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+         * success.
+         */
+        list_del(&waiter->list);
         rwsem_set_owner(sem);
         return true;
 }
@@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
                 }
                 adjustment += RWSEM_FLAG_WAITERS;
         }
-        list_add_tail(&waiter.list, &sem->wait_list);
+        rwsem_add_waiter(sem, &waiter);
 
         /* we're now waiting on the lock, but no longer actively locking */
         count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
         return sem;
 
 out_nolock:
-        list_del(&waiter.list);
-        if (list_empty(&sem->wait_list)) {
-                atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-                                   &sem->count);
-        }
+        rwsem_del_waiter(sem, &waiter);
         raw_spin_unlock_irq(&sem->wait_lock);
         __set_current_state(TASK_RUNNING);
         lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
         long count;
-        enum writer_wait_state wstate;
         struct rwsem_waiter waiter;
-        struct rw_semaphore *ret = sem;
         DEFINE_WAKE_Q(wake_q);
 
         /* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
         waiter.task = current;
         waiter.type = RWSEM_WAITING_FOR_WRITE;
         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+        waiter.handoff_set = false;
 
         raw_spin_lock_irq(&sem->wait_lock);
-
-        /* account for this before adding a new element to the list */
-        wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-        list_add_tail(&waiter.list, &sem->wait_list);
+        rwsem_add_waiter(sem, &waiter);
 
         /* we're now waiting on the lock */
-        if (wstate == WRITER_NOT_FIRST) {
+        if (rwsem_first_waiter(sem) != &waiter) {
                 count = atomic_long_read(&sem->count);
 
                 /*
@@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
         /* wait until we successfully acquire the lock */
         set_current_state(state);
         for (;;) {
-                if (rwsem_try_write_lock(sem, wstate)) {
+                if (rwsem_try_write_lock(sem, &waiter)) {
                         /* rwsem_try_write_lock() implies ACQUIRE on success */
                         break;
                 }
 
                 raw_spin_unlock_irq(&sem->wait_lock);
 
+                if (signal_pending_state(state, current))
+                        goto out_nolock;
+
                 /*
                  * After setting the handoff bit and failing to acquire
                  * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
                  * In this case, we attempt to acquire the lock again
                  * without sleeping.
                  */
-                if (wstate == WRITER_HANDOFF) {
+                if (waiter.handoff_set) {
                         enum owner_state owner_state;
 
                         preempt_disable();
@@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
                         goto trylock_again;
                 }
 
-                /* Block until there are no active lockers. */
-                for (;;) {
-                        if (signal_pending_state(state, current))
-                                goto out_nolock;
-
-                        schedule();
-                        lockevent_inc(rwsem_sleep_writer);
-                        set_current_state(state);
-                        /*
-                         * If HANDOFF bit is set, unconditionally do
-                         * a trylock.
-                         */
-                        if (wstate == WRITER_HANDOFF)
-                                break;
-
-                        if ((wstate == WRITER_NOT_FIRST) &&
-                            (rwsem_first_waiter(sem) == &waiter))
-                                wstate = WRITER_FIRST;
-
-                        count = atomic_long_read(&sem->count);
-                        if (!(count & RWSEM_LOCK_MASK))
-                                break;
-
-                        /*
-                         * The setting of the handoff bit is deferred
-                         * until rwsem_try_write_lock() is called.
-                         */
-                        if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-                            time_after(jiffies, waiter.timeout))) {
-                                wstate = WRITER_HANDOFF;
-                                lockevent_inc(rwsem_wlock_handoff);
-                                break;
-                        }
-                }
+                schedule();
+                lockevent_inc(rwsem_sleep_writer);
+                set_current_state(state);
 trylock_again:
                 raw_spin_lock_irq(&sem->wait_lock);
         }
         __set_current_state(TASK_RUNNING);
-        list_del(&waiter.list);
         raw_spin_unlock_irq(&sem->wait_lock);
         lockevent_inc(rwsem_wlock);
-
-        return ret;
+        return sem;
 
 out_nolock:
         __set_current_state(TASK_RUNNING);
         raw_spin_lock_irq(&sem->wait_lock);
-        list_del(&waiter.list);
-
-        if (unlikely(wstate == WRITER_HANDOFF))
-                atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
-        if (list_empty(&sem->wait_list))
-                atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-        else
+        rwsem_del_waiter(sem, &waiter);
+        if (!list_empty(&sem->wait_list))
                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
         raw_spin_unlock_irq(&sem->wait_lock);
         wake_up_q(&wake_q);
         lockevent_inc(rwsem_wlock_fail);
-
         return ERR_PTR(-EINTR);
 }
 
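Stitching the write-slowpath hunks above together, the simplified trylock loop after this patch reads roughly as follows (a sketch reconstructed from the diff, with the owner-spinning body elided):

    /* wait until we successfully acquire the lock */
    set_current_state(state);
    for (;;) {
            if (rwsem_try_write_lock(sem, &waiter)) {
                    /* rwsem_try_write_lock() implies ACQUIRE on success */
                    break;
            }

            raw_spin_unlock_irq(&sem->wait_lock);

            if (signal_pending_state(state, current))
                    goto out_nolock;

            /*
             * If waiter.handoff_set, spin on the owner to accelerate the
             * lock transfer; this may goto trylock_again without sleeping
             * (body elided here, see the diff above).
             */

            schedule();
            lockevent_inc(rwsem_sleep_writer);
            set_current_state(state);
    trylock_again:
            raw_spin_lock_irq(&sem->wait_lock);
    }

Compared with the old nested "Block until there are no active lockers" loop, the handoff decision is now made inside rwsem_try_write_lock() (recorded in waiter.handoff_set) while wait_lock is held, and the signal check and sleep happen in a single pass of one loop after the lock is dropped.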