Skip to content

Commit 0b26351

Browse files
Peter Zijlstra authored and Ingo Molnar committed
stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock
Matt reported the following deadlock:

	CPU0					CPU1

	schedule(.prev=migrate/0)		<fault>
	  pick_next_task()			...
	    idle_balance()			migrate_swap()
	      active_balance()			  stop_two_cpus()
						    spin_lock(stopper0->lock)
						    spin_lock(stopper1->lock)
						    ttwu(migrate/0)
						      smp_cond_load_acquire() -- waits for schedule()
	        stop_one_cpu(1)
	          spin_lock(stopper1->lock) -- waits for stopper lock

Fix this deadlock by taking the wakeups out from under stopper->lock.
This allows the active_balance() to queue the stop work and finish the
context switch, which in turn allows the wakeup from migrate_swap() to
observe the context and complete the wakeup.

Reported-by: Matt Fleming <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Matt Fleming <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent f4ef6a4 commit 0b26351

File tree

1 file changed

+14
-5
lines changed

1 file changed

+14
-5
lines changed

kernel/stop_machine.c

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
#include <linux/smpboot.h>
2222
#include <linux/atomic.h>
2323
#include <linux/nmi.h>
24+
#include <linux/sched/wake_q.h>
2425

2526
/*
2627
* Structure to determine completion condition and record errors. May
@@ -65,27 +66,31 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
6566
}
6667

6768
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
68-
struct cpu_stop_work *work)
69+
struct cpu_stop_work *work,
70+
struct wake_q_head *wakeq)
6971
{
7072
list_add_tail(&work->list, &stopper->works);
71-
wake_up_process(stopper->thread);
73+
wake_q_add(wakeq, stopper->thread);
7274
}
7375

7476
/* queue @work to @stopper. if offline, @work is completed immediately */
7577
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
7678
{
7779
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
80+
DEFINE_WAKE_Q(wakeq);
7881
unsigned long flags;
7982
bool enabled;
8083

8184
spin_lock_irqsave(&stopper->lock, flags);
8285
enabled = stopper->enabled;
8386
if (enabled)
84-
__cpu_stop_queue_work(stopper, work);
87+
__cpu_stop_queue_work(stopper, work, &wakeq);
8588
else if (work->done)
8689
cpu_stop_signal_done(work->done);
8790
spin_unlock_irqrestore(&stopper->lock, flags);
8891

92+
wake_up_q(&wakeq);
93+
8994
return enabled;
9095
}
9196

@@ -229,6 +234,7 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
229234
{
230235
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
231236
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
237+
DEFINE_WAKE_Q(wakeq);
232238
int err;
233239
retry:
234240
spin_lock_irq(&stopper1->lock);
@@ -252,8 +258,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
252258
goto unlock;
253259

254260
err = 0;
255-
__cpu_stop_queue_work(stopper1, work1);
256-
__cpu_stop_queue_work(stopper2, work2);
261+
__cpu_stop_queue_work(stopper1, work1, &wakeq);
262+
__cpu_stop_queue_work(stopper2, work2, &wakeq);
257263
unlock:
258264
spin_unlock(&stopper2->lock);
259265
spin_unlock_irq(&stopper1->lock);
@@ -263,6 +269,9 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
263269
cpu_relax();
264270
goto retry;
265271
}
272+
273+
wake_up_q(&wakeq);
274+
266275
return err;
267276
}
268277
/**

0 commit comments

Comments (0)