Commit 9af6528

Peter Zijlstra authored and Ingo Molnar committed
sched/core: Optimize __schedule()
Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD
context switch, we can avoid the TASK_DEAD special case currently in
__schedule() because that avoids the extra preempt_disable() from
schedule().

In order to facilitate this, create a do_task_dead() helper which we
place in the scheduler code, such that it can access __schedule().

Also add some __noreturn annotations to the functions, there's no
coming back from do_exit().

Suggested-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Cheng Chao <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
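For context on the "extra preempt_disable()" the message refers to: at the time, the schedule() wrapper looked roughly like the abridged sketch below (from kernel/sched/core.c of this era; not part of this diff). Every pass brackets __schedule() with a preempt_disable()/sched_preempt_enable_no_resched() pair, and the TASK_DEAD special case removed below existed only to undo that count for the exit path.

    asmlinkage __visible void __sched schedule(void)
    {
            struct task_struct *tsk = current;

            sched_submit_work(tsk);
            do {
                    preempt_disable();      /* the count do_exit() used to inherit */
                    __schedule(false);
                    sched_preempt_enable_no_resched();
            } while (need_resched());
    }

By calling __schedule(false) directly, do_task_dead() never picks up that extra count, so the prev->state == TASK_DEAD fixup in __schedule() can go.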
1 parent bf89a30 commit 9af6528

4 files changed: +34, -41 lines

include/linux/kernel.h
Lines changed: 3 additions & 6 deletions

@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
-void panic(const char *fmt, ...)
-	__noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-void do_exit(long error_code)
-	__noreturn;
-void complete_and_exit(struct completion *, long)
-	__noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
 
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
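
As an aside on what the annotation buys: __noreturn tells the compiler that control never comes back, so callers get dead-code elimination and no "control reaches end of non-void function" warnings after a call. A minimal userspace sketch (the names die() and checked_div() are invented for illustration; this is not kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    #define __noreturn __attribute__((__noreturn__))

    __noreturn void die(const char *msg)
    {
            fprintf(stderr, "fatal: %s\n", msg);
            exit(1);        /* exit(3) is itself noreturn, so no guard loop needed */
    }

    int checked_div(int a, int b)
    {
            if (b == 0)
                    die("division by zero");
            return a / b;   /* no warning: the compiler knows die() cannot return */
    }

    int main(void)
    {
            printf("%d\n", checked_div(10, 2));
            return 0;
    }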

include/linux/sched.h
Lines changed: 2 additions & 0 deletions

@@ -448,6 +448,8 @@ static inline void io_schedule(void)
 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }
 
+void __noreturn do_task_dead(void);
+
 struct nsproxy;
 struct user_namespace;
 

kernel/exit.c
Lines changed: 2 additions & 24 deletions

@@ -725,7 +725,7 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
-void do_exit(long code)
+void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
@@ -882,29 +882,7 @@ void do_exit(long code)
 	exit_rcu();
 	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
 
-	/*
-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-	 * when the following two conditions become true.
-	 *   - There is race condition of mmap_sem (It is acquired by
-	 *     exit_mm()), and
-	 *   - SMI occurs before setting TASK_RUNINNG.
-	 *     (or hypervisor of virtual machine switches to other guest)
-	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-	 *
-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
-	 * is held by try_to_wake_up()
-	 */
-	smp_mb();
-	raw_spin_unlock_wait(&tsk->pi_lock);
-
-	/* causes final put_task_struct in finish_task_switch(). */
-	tsk->state = TASK_DEAD;
-	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return". */
-	for (;;)
-		cpu_relax();	/* For when BUG is null */
+	do_task_dead();
 }
 EXPORT_SYMBOL_GPL(do_exit);
 

kernel/sched/core.c
Lines changed: 27 additions & 11 deletions

@@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	/*
-	 * do_exit() calls schedule() with preemption disabled as an exception;
-	 * however we must fix that up, otherwise the next task will see an
-	 * inconsistent (higher) preempt count.
-	 *
-	 * It also avoids the below schedule_debug() test from complaining
-	 * about this.
-	 */
-	if (unlikely(prev->state == TASK_DEAD))
-		preempt_enable_no_resched_notrace();
-
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
+void __noreturn do_task_dead(void)
+{
+	/*
+	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+	 * when the following two conditions become true.
+	 *   - There is race condition of mmap_sem (It is acquired by
+	 *     exit_mm()), and
+	 *   - SMI occurs before setting TASK_RUNINNG.
+	 *     (or hypervisor of virtual machine switches to other guest)
+	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+	 *
+	 * To avoid it, we have to wait for releasing tsk->pi_lock which
+	 * is held by try_to_wake_up()
+	 */
+	smp_mb();
+	raw_spin_unlock_wait(&current->pi_lock);
+
+	/* causes final put_task_struct in finish_task_switch(). */
+	__set_current_state(TASK_DEAD);
+	current->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
+	__schedule(false);
+	BUG();
+	/* Avoid "noreturn function does return". */
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
+}
+
 static inline void sched_submit_work(struct task_struct *tsk)
 {
 	if (!tsk->state || tsk_is_pi_blocked(tsk))