Skip to content

Commit 0122ec5

Browse files
Peter Zijlstra authored and Ingo Molnar committed
sched: Add p->pi_lock to task_rq_lock()
In order to be able to call set_task_cpu() while either holding p->pi_lock or task_rq(p)->lock we need to hold both locks in order to stabilize task_rq().

This makes task_rq_lock() acquire both locks, and have __task_rq_lock() validate that p->pi_lock is held. This increases the locking overhead for most scheduler syscalls but allows reduction of rq->lock contention for some scheduler hot paths (ttwu).

Reviewed-by: Frank Rowand <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent 2acca55 commit 0122ec5

File tree

1 file changed

+47
-56
lines changed

1 file changed

+47
-56
lines changed

kernel/sched.c

Lines changed: 47 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -599,7 +599,7 @@ static inline int cpu_of(struct rq *rq)
599599
* Return the group to which this tasks belongs.
600600
*
601601
* We use task_subsys_state_check() and extend the RCU verification
602-
* with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
602+
* with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
603603
* holds that lock for each task it moves into the cgroup. Therefore
604604
* by holding that lock, we pin the task to the current cgroup.
605605
*/
@@ -609,7 +609,7 @@ static inline struct task_group *task_group(struct task_struct *p)
609609
struct cgroup_subsys_state *css;
610610

611611
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
612-
lockdep_is_held(&task_rq(p)->lock));
612+
lockdep_is_held(&p->pi_lock));
613613
tg = container_of(css, struct task_group, css);
614614

615615
return autogroup_task_group(p, tg);
@@ -924,23 +924,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
924924
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
925925

926926
/*
927-
* Check whether the task is waking, we use this to synchronize ->cpus_allowed
928-
* against ttwu().
929-
*/
930-
static inline int task_is_waking(struct task_struct *p)
931-
{
932-
return unlikely(p->state == TASK_WAKING);
933-
}
934-
935-
/*
936-
* __task_rq_lock - lock the runqueue a given task resides on.
937-
* Must be called interrupts disabled.
927+
* __task_rq_lock - lock the rq @p resides on.
938928
*/
939929
static inline struct rq *__task_rq_lock(struct task_struct *p)
940930
__acquires(rq->lock)
941931
{
942932
struct rq *rq;
943933

934+
lockdep_assert_held(&p->pi_lock);
935+
944936
for (;;) {
945937
rq = task_rq(p);
946938
raw_spin_lock(&rq->lock);
@@ -951,22 +943,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
951943
}
952944

953945
/*
954-
* task_rq_lock - lock the runqueue a given task resides on and disable
955-
* interrupts. Note the ordering: we can safely lookup the task_rq without
956-
* explicitly disabling preemption.
946+
* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
957947
*/
958948
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
949+
__acquires(p->pi_lock)
959950
__acquires(rq->lock)
960951
{
961952
struct rq *rq;
962953

963954
for (;;) {
964-
local_irq_save(*flags);
955+
raw_spin_lock_irqsave(&p->pi_lock, *flags);
965956
rq = task_rq(p);
966957
raw_spin_lock(&rq->lock);
967958
if (likely(rq == task_rq(p)))
968959
return rq;
969-
raw_spin_unlock_irqrestore(&rq->lock, *flags);
960+
raw_spin_unlock(&rq->lock);
961+
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
970962
}
971963
}
972964

@@ -976,10 +968,13 @@ static void __task_rq_unlock(struct rq *rq)
976968
raw_spin_unlock(&rq->lock);
977969
}
978970

979-
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
971+
static inline void
972+
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
980973
__releases(rq->lock)
974+
__releases(p->pi_lock)
981975
{
982-
raw_spin_unlock_irqrestore(&rq->lock, *flags);
976+
raw_spin_unlock(&rq->lock);
977+
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
983978
}
984979

985980
/*
@@ -2175,6 +2170,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
21752170
*/
21762171
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
21772172
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2173+
2174+
#ifdef CONFIG_LOCKDEP
2175+
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2176+
lockdep_is_held(&task_rq(p)->lock)));
2177+
#endif
21782178
#endif
21792179

21802180
trace_sched_migrate_task(p, new_cpu);
@@ -2270,7 +2270,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22702270
ncsw = 0;
22712271
if (!match_state || p->state == match_state)
22722272
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2273-
task_rq_unlock(rq, &flags);
2273+
task_rq_unlock(rq, p, &flags);
22742274

22752275
/*
22762276
* If it changed from the expected state, bail out now.
@@ -2652,6 +2652,7 @@ static void __sched_fork(struct task_struct *p)
26522652
*/
26532653
void sched_fork(struct task_struct *p, int clone_flags)
26542654
{
2655+
unsigned long flags;
26552656
int cpu = get_cpu();
26562657

26572658
__sched_fork(p);
@@ -2702,9 +2703,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
27022703
*
27032704
* Silence PROVE_RCU.
27042705
*/
2705-
rcu_read_lock();
2706+
raw_spin_lock_irqsave(&p->pi_lock, flags);
27062707
set_task_cpu(p, cpu);
2707-
rcu_read_unlock();
2708+
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
27082709

27092710
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
27102711
if (likely(sched_info_on()))
@@ -2753,7 +2754,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
27532754
set_task_cpu(p, cpu);
27542755

27552756
p->state = TASK_RUNNING;
2756-
task_rq_unlock(rq, &flags);
2757+
task_rq_unlock(rq, p, &flags);
27572758
#endif
27582759

27592760
rq = task_rq_lock(p, &flags);
@@ -2765,7 +2766,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
27652766
if (p->sched_class->task_woken)
27662767
p->sched_class->task_woken(rq, p);
27672768
#endif
2768-
task_rq_unlock(rq, &flags);
2769+
task_rq_unlock(rq, p, &flags);
27692770
put_cpu();
27702771
}
27712772

@@ -3490,12 +3491,12 @@ void sched_exec(void)
34903491
likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
34913492
struct migration_arg arg = { p, dest_cpu };
34923493

3493-
task_rq_unlock(rq, &flags);
3494+
task_rq_unlock(rq, p, &flags);
34943495
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
34953496
return;
34963497
}
34973498
unlock:
3498-
task_rq_unlock(rq, &flags);
3499+
task_rq_unlock(rq, p, &flags);
34993500
}
35003501

35013502
#endif
@@ -3532,7 +3533,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
35323533

35333534
rq = task_rq_lock(p, &flags);
35343535
ns = do_task_delta_exec(p, rq);
3535-
task_rq_unlock(rq, &flags);
3536+
task_rq_unlock(rq, p, &flags);
35363537

35373538
return ns;
35383539
}
@@ -3550,7 +3551,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
35503551

35513552
rq = task_rq_lock(p, &flags);
35523553
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3553-
task_rq_unlock(rq, &flags);
3554+
task_rq_unlock(rq, p, &flags);
35543555

35553556
return ns;
35563557
}
@@ -3574,7 +3575,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
35743575
rq = task_rq_lock(p, &flags);
35753576
thread_group_cputime(p, &totals);
35763577
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3577-
task_rq_unlock(rq, &flags);
3578+
task_rq_unlock(rq, p, &flags);
35783579

35793580
return ns;
35803581
}
@@ -4693,16 +4694,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
46934694
*/
46944695
void rt_mutex_setprio(struct task_struct *p, int prio)
46954696
{
4696-
unsigned long flags;
46974697
int oldprio, on_rq, running;
46984698
struct rq *rq;
46994699
const struct sched_class *prev_class;
47004700

47014701
BUG_ON(prio < 0 || prio > MAX_PRIO);
47024702

4703-
lockdep_assert_held(&p->pi_lock);
4704-
4705-
rq = task_rq_lock(p, &flags);
4703+
rq = __task_rq_lock(p);
47064704

47074705
trace_sched_pi_setprio(p, prio);
47084706
oldprio = p->prio;
@@ -4727,7 +4725,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
47274725
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
47284726

47294727
check_class_changed(rq, p, prev_class, oldprio);
4730-
task_rq_unlock(rq, &flags);
4728+
__task_rq_unlock(rq);
47314729
}
47324730

47334731
#endif
@@ -4775,7 +4773,7 @@ void set_user_nice(struct task_struct *p, long nice)
47754773
resched_task(rq->curr);
47764774
}
47774775
out_unlock:
4778-
task_rq_unlock(rq, &flags);
4776+
task_rq_unlock(rq, p, &flags);
47794777
}
47804778
EXPORT_SYMBOL(set_user_nice);
47814779

@@ -5003,20 +5001,17 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
50035001
/*
50045002
* make sure no PI-waiters arrive (or leave) while we are
50055003
* changing the priority of the task:
5006-
*/
5007-
raw_spin_lock_irqsave(&p->pi_lock, flags);
5008-
/*
5004+
*
50095005
* To be able to change p->policy safely, the appropriate
50105006
* runqueue lock must be held.
50115007
*/
5012-
rq = __task_rq_lock(p);
5008+
rq = task_rq_lock(p, &flags);
50135009

50145010
/*
50155011
* Changing the policy of the stop threads its a very bad idea
50165012
*/
50175013
if (p == rq->stop) {
5018-
__task_rq_unlock(rq);
5019-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5014+
task_rq_unlock(rq, p, &flags);
50205015
return -EINVAL;
50215016
}
50225017

@@ -5040,8 +5035,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
50405035
if (rt_bandwidth_enabled() && rt_policy(policy) &&
50415036
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50425037
!task_group_is_autogroup(task_group(p))) {
5043-
__task_rq_unlock(rq);
5044-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5038+
task_rq_unlock(rq, p, &flags);
50455039
return -EPERM;
50465040
}
50475041
}
@@ -5050,8 +5044,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
50505044
/* recheck policy now with rq lock held */
50515045
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
50525046
policy = oldpolicy = -1;
5053-
__task_rq_unlock(rq);
5054-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5047+
task_rq_unlock(rq, p, &flags);
50555048
goto recheck;
50565049
}
50575050
on_rq = p->on_rq;
@@ -5073,8 +5066,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
50735066
activate_task(rq, p, 0);
50745067

50755068
check_class_changed(rq, p, prev_class, oldprio);
5076-
__task_rq_unlock(rq);
5077-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069+
task_rq_unlock(rq, p, &flags);
50785070

50795071
rt_mutex_adjust_pi(p);
50805072

@@ -5666,7 +5658,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
56665658

56675659
rq = task_rq_lock(p, &flags);
56685660
time_slice = p->sched_class->get_rr_interval(rq, p);
5669-
task_rq_unlock(rq, &flags);
5661+
task_rq_unlock(rq, p, &flags);
56705662

56715663
rcu_read_unlock();
56725664
jiffies_to_timespec(time_slice, &t);
@@ -5889,8 +5881,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
58895881
unsigned int dest_cpu;
58905882
int ret = 0;
58915883

5892-
raw_spin_lock_irqsave(&p->pi_lock, flags);
5893-
rq = __task_rq_lock(p);
5884+
rq = task_rq_lock(p, &flags);
58945885

58955886
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
58965887
ret = -EINVAL;
@@ -5918,15 +5909,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
59185909
if (need_migrate_task(p)) {
59195910
struct migration_arg arg = { p, dest_cpu };
59205911
/* Need help from migration thread: drop lock and wait. */
5921-
__task_rq_unlock(rq);
5922-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5912+
task_rq_unlock(rq, p, &flags);
59235913
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
59245914
tlb_migrate_finish(p->mm);
59255915
return 0;
59265916
}
59275917
out:
5928-
__task_rq_unlock(rq);
5929-
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5918+
task_rq_unlock(rq, p, &flags);
59305919

59315920
return ret;
59325921
}
@@ -5954,6 +5943,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
59545943
rq_src = cpu_rq(src_cpu);
59555944
rq_dest = cpu_rq(dest_cpu);
59565945

5946+
raw_spin_lock(&p->pi_lock);
59575947
double_rq_lock(rq_src, rq_dest);
59585948
/* Already moved. */
59595949
if (task_cpu(p) != src_cpu)
@@ -5976,6 +5966,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
59765966
ret = 1;
59775967
fail:
59785968
double_rq_unlock(rq_src, rq_dest);
5969+
raw_spin_unlock(&p->pi_lock);
59795970
return ret;
59805971
}
59815972

@@ -8702,7 +8693,7 @@ void sched_move_task(struct task_struct *tsk)
87028693
if (on_rq)
87038694
enqueue_task(rq, tsk, 0);
87048695

8705-
task_rq_unlock(rq, &flags);
8696+
task_rq_unlock(rq, tsk, &flags);
87068697
}
87078698
#endif /* CONFIG_CGROUP_SCHED */
87088699

0 commit comments

Comments
 (0)