
Commit 9c5efe9

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:

 - Apply a number of membarrier related fixes and cleanups, which fixes
   a use-after-free race in the membarrier code

 - Introduce proper RCU protection for tasks on the runqueue - to get
   rid of the subtle task_rcu_dereference() interface that was easy to
   get wrong

 - Misc fixes, but also an EAS speedup

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid redundant EAS calculation
  sched/core: Remove double update_max_interval() call on CPU startup
  sched/core: Fix preempt_schedule() interrupt return comment
  sched/fair: Fix -Wunused-but-set-variable warnings
  sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
  sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
  sched/membarrier: Skip IPIs when mm->mm_users == 1
  selftests, sched/membarrier: Add multi-threaded test
  sched/membarrier: Fix p->mm->membarrier_state racy load
  sched/membarrier: Call sync_core only before usermode for same mm
  sched/membarrier: Remove redundant check
  sched/membarrier: Fix private expedited registration check
  tasks, sched/core: RCUify the assignment of rq->curr
  tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
  tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
  tasks: Add a count of task RCU users
  sched/core: Convert vcpu_is_preempted() from macro to an inline function
  sched/fair: Remove unused cfs_rq_clock_task() function
2 parents aefcf2f + 4892f51 commit 9c5efe9
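For context, the membarrier fixes listed above are all on the kernel side of the membarrier(2) system call. A minimal user-space sketch of the private expedited flow (an illustration, not part of this commit) looks roughly like this; registration has to precede the expedited command, which is one of the checks tightened by the registration fix in the shortlog:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        /* Once per process, before issuing expedited barriers. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
                return 1;
        /* Later, from any thread: barriers all other running threads of this mm. */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
                return 1;
        return 0;
}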

17 files changed, 375 insertions(+), 250 deletions(-)


fs/exec.c

Lines changed: 1 addition & 1 deletion
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
         }
         task_lock(tsk);
         active_mm = tsk->active_mm;
+        membarrier_exec_mmap(mm);
         tsk->mm = mm;
         tsk->active_mm = mm;
         activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename,
         /* execve succeeded */
         current->fs->in_exec = 0;
         current->in_execve = 0;
-        membarrier_execve(current);
         rseq_execve(current);
         acct_update_integrals(current);
         task_numa_free(current, false);
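The new membarrier_exec_mmap() helper replaces membarrier_execve() and is defined in kernel/sched/membarrier.c, one of the 17 changed files but not shown in this excerpt. Roughly, it has to do what the old inline did (clear the membarrier state of the mm being installed) and additionally keep the runqueue-side cached state in sync; a sketch of that shape, with the exact barriers and the per-CPU runqueue field paraphrased rather than quoted:

void membarrier_exec_mmap(struct mm_struct *mm)
{
        /*
         * Order memory accesses performed before exec against the
         * clearing of the membarrier state below (sketch; the real
         * helper documents the exact ordering requirements).
         */
        smp_mb();
        atomic_set(&mm->membarrier_state, 0);
        /* Keep this CPU's runqueue copy in sync with the fresh mm state. */
        this_cpu_write(runqueues.membarrier_state, 0);
}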

include/linux/mm_types.h

Lines changed: 11 additions & 3 deletions
@@ -383,6 +383,16 @@ struct mm_struct {
                 unsigned long highest_vm_end;  /* highest vma end address */
                 pgd_t * pgd;

+#ifdef CONFIG_MEMBARRIER
+                /**
+                 * @membarrier_state: Flags controlling membarrier behavior.
+                 *
+                 * This field is close to @pgd to hopefully fit in the same
+                 * cache-line, which needs to be touched by switch_mm().
+                 */
+                atomic_t membarrier_state;
+#endif
+
                 /**
                  * @mm_users: The number of users including userspace.
                  *
@@ -452,9 +462,7 @@ struct mm_struct {
                 unsigned long flags; /* Must use atomic bitops to access */

                 struct core_state *core_state; /* coredumping support */
-#ifdef CONFIG_MEMBARRIER
-                atomic_t membarrier_state;
-#endif
+
 #ifdef CONFIG_AIO
                 spinlock_t ioctx_lock;
                 struct kioctx_table __rcu *ioctx_table;
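The cache-line comment refers to the context-switch path: elsewhere in this series (see the kernel/sched/core.c hunks below) a membarrier_switch_mm() helper reads @membarrier_state right where switch_mm() is about to touch the pgd. A rough sketch of what such a helper can look like, assuming the runqueue-side membarrier_state cache that the series adds to struct rq (not shown in this excerpt):

static inline void membarrier_switch_mm(struct rq *rq,
                                        struct mm_struct *prev_mm,
                                        struct mm_struct *next_mm)
{
        int membarrier_state;

        if (prev_mm == next_mm)
                return;

        /* This read sits next to the pgd access done by switch_mm(). */
        membarrier_state = atomic_read(&next_mm->membarrier_state);
        if (READ_ONCE(rq->membarrier_state) == membarrier_state)
                return;
        WRITE_ONCE(rq->membarrier_state, membarrier_state);
}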

include/linux/rcuwait.h

Lines changed: 4 additions & 16 deletions
@@ -6,16 +6,11 @@

 /*
  * rcuwait provides a way of blocking and waking up a single
- * task in an rcu-safe manner; where it is forbidden to use
- * after exit_notify(). task_struct is not properly rcu protected,
- * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ * task in an rcu-safe manner.
  *
- * Alternatively we have task_rcu_dereference(), but the return
- * semantics have different implications which would break the
- * wakeup side. The only time @task is non-nil is when a user is
- * blocked (or checking if it needs to) on a condition, and reset
- * as soon as we know that the condition has succeeded and are
- * awoken.
+ * The only time @task is non-nil is when a user is blocked (or
+ * checking if it needs to) on a condition, and reset as soon as we
+ * know that the condition has succeeded and are awoken.
  */
 struct rcuwait {
         struct task_struct __rcu *task;
@@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w);
  */
 #define rcuwait_wait_event(w, condition)                                \
 ({                                                                      \
-        /*                                                              \
-         * Complain if we are called after do_exit()/exit_notify(),    \
-         * as we cannot rely on the rcu critical region for the        \
-         * wakeup side.                                                 \
-         */                                                             \
-        WARN_ON(current->exit_state);                                   \
-                                                                        \
         rcu_assign_pointer((w)->task, current);                         \
         for (;;) {                                                      \
                 /*                                                      \
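For reference, a minimal rcuwait pairing looks like the hypothetical sketch below ('reap_done' and 'work_finished' are made-up names). The WARN_ON(current->exit_state) guard removed above is no longer needed because, after this series, an exiting task's task_struct is kept for an RCU grace period past its final schedule, so the rcu_dereference() on the wakeup side remains safe:

static struct rcuwait reap_done;        /* hypothetical example objects */
static bool work_finished;

static void example_init(void)
{
        rcuwait_init(&reap_done);
}

static void example_waiter(void)
{
        rcuwait_wait_event(&reap_done, READ_ONCE(work_finished));
}

static void example_waker(void)         /* runs in another task */
{
        WRITE_ONCE(work_finished, true);
        rcuwait_wake_up(&reap_done);
}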

include/linux/sched.h

Lines changed: 8 additions & 2 deletions
@@ -1130,7 +1130,10 @@ struct task_struct {

         struct tlbflush_unmap_batch     tlb_ubc;

-        struct rcu_head                 rcu;
+        union {
+                refcount_t              rcu_users;
+                struct rcu_head         rcu;
+        };

         /* Cache last used pipe for splice(): */
         struct pipe_inode_info          *splice_pipe;
@@ -1839,7 +1842,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
  * running or not.
  */
 #ifndef vcpu_is_preempted
-# define vcpu_is_preempted(cpu) false
+static inline bool vcpu_is_preempted(int cpu)
+{
+        return false;
+}
 #endif

 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
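One practical difference between the old macro and the new inline stub: the macro expanded to a bare 'false', so a variable whose only use was this call looked set-but-unused to the compiler, while an inline function keeps the argument evaluated and type-checked. A hypothetical caller illustrating the point:

/* Hypothetical caller, for illustration only. */
static bool sibling_preempted(int this_cpu)
{
        int cpu = this_cpu ^ 1;

        /*
         * With '#define vcpu_is_preempted(cpu) false' this reduces to
         * 'return false;' and 'cpu' trips -Wunused-but-set-variable;
         * the inline function keeps 'cpu' used.
         */
        return vcpu_is_preempted(cpu);
}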

include/linux/sched/mm.h

Lines changed: 5 additions & 5 deletions
@@ -362,16 +362,16 @@ enum {

 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
+        if (current->mm != mm)
+                return;
         if (likely(!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
                 return;
         sync_core_before_usermode();
 }

-static inline void membarrier_execve(struct task_struct *t)
-{
-        atomic_set(&t->mm->membarrier_state, 0);
-}
+extern void membarrier_exec_mmap(struct mm_struct *mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -380,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 {
 }
 #endif
-static inline void membarrier_execve(struct task_struct *t)
+static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
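The new current->mm check matters because this helper runs on the mm-drop side of finish_task_switch(), where the mm being dropped can be an active_mm lazily borrowed by a kernel thread rather than the mm the CPU will return to usermode with; in that case a sync_core is pointless. A sketch of the call site as I read it (not quoted verbatim from the tree):

        /* In finish_task_switch(), when the previous task's mm is dropped: */
        if (mm) {
                membarrier_mm_sync_core_before_usermode(mm);
                mmdrop(mm);
        }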

include/linux/sched/task.h

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ static inline void put_task_struct(struct task_struct *t)
                 __put_task_struct(t);
 }

-struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+void put_task_struct_rcu_user(struct task_struct *task);

 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;

kernel/exit.c

Lines changed: 6 additions & 68 deletions
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
         put_task_struct(tsk);
 }

+void put_task_struct_rcu_user(struct task_struct *task)
+{
+        if (refcount_dec_and_test(&task->rcu_users))
+                call_rcu(&task->rcu, delayed_put_task_struct);
+}

 void release_task(struct task_struct *p)
 {
@@ -222,76 +227,13 @@ void release_task(struct task_struct *p)

         write_unlock_irq(&tasklist_lock);
         release_thread(p);
-        call_rcu(&p->rcu, delayed_put_task_struct);
+        put_task_struct_rcu_user(p);

         p = leader;
         if (unlikely(zap_leader))
                 goto repeat;
 }

-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
-        struct sighand_struct *sighand;
-        struct task_struct *task;
-
-        /*
-         * We need to verify that release_task() was not called and thus
-         * delayed_put_task_struct() can't run and drop the last reference
-         * before rcu_read_unlock(). We check task->sighand != NULL,
-         * but we can read the already freed and reused memory.
-         */
-retry:
-        task = rcu_dereference(*ptask);
-        if (!task)
-                return NULL;
-
-        probe_kernel_address(&task->sighand, sighand);
-
-        /*
-         * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
-         * was already freed we can not miss the preceding update of this
-         * pointer.
-         */
-        smp_rmb();
-        if (unlikely(task != READ_ONCE(*ptask)))
-                goto retry;
-
-        /*
-         * We've re-checked that "task == *ptask", now we have two different
-         * cases:
-         *
-         * 1. This is actually the same task/task_struct. In this case
-         *    sighand != NULL tells us it is still alive.
-         *
-         * 2. This is another task which got the same memory for task_struct.
-         *    We can't know this of course, and we can not trust
-         *    sighand != NULL.
-         *
-         *    In this case we actually return a random value, but this is
-         *    correct.
-         *
-         *    If we return NULL - we can pretend that we actually noticed that
-         *    *ptask was updated when the previous task has exited. Or pretend
-         *    that probe_slab_address(&sighand) reads NULL.
-         *
-         *    If we return the new task (because sighand is not NULL for any
-         *    reason) - this is fine too. This (new) task can't go away before
-         *    another gp pass.
-         *
-         *    And note: We could even eliminate the false positive if re-read
-         *    task->sighand once again to avoid the falsely NULL. But this case
-         *    is very unlikely so we don't care.
-         */
-        if (!sighand)
-                return NULL;
-
-        return task;
-}
-
 void rcuwait_wake_up(struct rcuwait *w)
 {
         struct task_struct *task;
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)
          */
         smp_mb(); /* (B) */

-        /*
-         * Avoid using task_rcu_dereference() magic as long as we are careful,
-         * see comment in rcuwait_wait_event() regarding ->exit_state.
-         */
         task = rcu_dereference(w->task);
         if (task)
                 wake_up_process(task);

kernel/fork.c

Lines changed: 5 additions & 3 deletions
@@ -915,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         tsk->cpus_ptr = &tsk->cpus_mask;

         /*
-         * One for us, one for whoever does the "release_task()" (usually
-         * parent)
+         * One for the user space visible state that goes away when reaped.
+         * One for the scheduler.
          */
-        refcount_set(&tsk->usage, 2);
+        refcount_set(&tsk->rcu_users, 2);
+        /* One for the rcu users */
+        refcount_set(&tsk->usage, 1);
 #ifdef CONFIG_BLK_DEV_IO_TRACE
         tsk->btrace_seq = 0;
 #endif
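Putting the two counters together, the intended lifetime (a summary of this hunk plus the kernel/exit.c and kernel/sched/core.c changes, not text from the diff) is roughly:

/*
 * At fork:                 rcu_users == 2, usage == 1
 *
 * release_task()           -> put_task_struct_rcu_user()  drops one rcu_user
 * finish_task_switch()     -> put_task_struct_rcu_user()  drops the other
 * rcu_users reaches zero   -> call_rcu(&p->rcu, delayed_put_task_struct)
 * after a grace period     -> delayed_put_task_struct() -> put_task_struct()
 *                             drops the final 'usage' reference
 *
 * Code that takes extra get_task_struct() references keeps using 'usage'
 * exactly as before.
 */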

kernel/sched/core.c

Lines changed: 15 additions & 13 deletions
@@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         if (cpumask_equal(p->cpus_ptr, new_mask))
                 goto out;

-        if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+        dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+        if (dest_cpu >= nr_cpu_ids) {
                 ret = -EINVAL;
                 goto out;
         }
@@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;

-        dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
         if (task_running(rq, p) || p->state == TASK_WAKING) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
@@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 /* Task is done with its stack. */
                 put_task_stack(prev);

-                put_task_struct(prev);
+                put_task_struct_rcu_user(prev);
         }

         tick_nohz_task_switch();
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 else
                         prev->active_mm = NULL;
         } else {                                        // to user
+                membarrier_switch_mm(rq, prev->active_mm, next->mm);
                 /*
                  * sys_membarrier() requires an smp_mb() between setting
-                 * rq->curr and returning to userspace.
+                 * rq->curr / membarrier_switch_mm() and returning to userspace.
                  *
                  * The below provides this either through switch_mm(), or in
                  * case 'prev->active_mm == next->mm' through
                  * finish_task_switch()'s mmdrop().
                  */
-
                 switch_mm_irqs_off(prev->active_mm, next->mm, next);

                 if (!prev->mm) {                        // from kernel
@@ -4042,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt)

         if (likely(prev != next)) {
                 rq->nr_switches++;
-                rq->curr = next;
+                /*
+                 * RCU users of rcu_dereference(rq->curr) may not see
+                 * changes to task_struct made by pick_next_task().
+                 */
+                RCU_INIT_POINTER(rq->curr, next);
                 /*
                  * The membarrier system call requires each architecture
                  * to have a full memory barrier after updating
@@ -4223,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void)

 #ifdef CONFIG_PREEMPTION
 /*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
  */
 asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
@@ -4296,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 #endif /* CONFIG_PREEMPTION */

 /*
- * this is the entry point to schedule() from kernel preemption
+ * This is the entry point to schedule() from kernel preemption
  * off of irq context.
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
@@ -6069,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu)
         __set_task_cpu(idle, cpu);
         rcu_read_unlock();

-        rq->curr = rq->idle = idle;
+        rq->idle = idle;
+        rcu_assign_pointer(rq->curr, idle);
         idle->on_rq = TASK_ON_RQ_QUEUED;
 #ifdef CONFIG_SMP
         idle->on_cpu = 1;
@@ -6430,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu)
         }
         rq_unlock_irqrestore(rq, &rf);

-        update_max_interval();
-
         return 0;
 }
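Finally, the RCU publication of rq->curr above is what lets the membarrier IPI path (kernel/sched/membarrier.c, one of the 17 changed files but not shown in this excerpt) walk remote runqueues without the removed task_rcu_dereference(). In rough outline, including the new single-user shortcut from the shortlog (a sketch, not the verbatim code; 'tmpmask' is the cpumask of IPI targets):

        /* No other thread can observe the barrier: nothing to do. */
        if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
                return 0;

        rcu_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;

                if (cpu == raw_smp_processor_id())
                        continue;
                p = rcu_dereference(cpu_rq(cpu)->curr);
                if (p && p->mm == mm)
                        __cpumask_set_cpu(cpu, tmpmask);        /* IPI target */
        }
        rcu_read_unlock();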