
Commit 40f3a4f

danieljordan10 authored and jfvogel committed
ktask: let helpers finish asynchronously to avoid deadlock
UEK6 testing uncovered a rare hang during deferred page init in a kvm guest. The hang is more readily reproducible under some amounts of CPU and memory than others; for example, on a 72-CPU system, 46-CPU guests would hit it every ten to one hundred tries, but 44-CPU, 32-CPU, 16-CPU, and 8-CPU guests survived hundreds of reboots. UEK5 is not affected.

The problem is a deadlock that arises because the main page init thread (pgdatinitN) holds the spinlock node_size_lock for the whole operation. It can happen two ways.

First, when the main thread calls into ktask, ktask queues a work per helper thread, and when the workqueue layer wakes up a worker or kthreadd (to create a worker), the scheduler rarely decides to place it on the same runqueue as the main thread. The worker or kthreadd then never runs because the main thread is busy-waiting for either to finish.

Second, an interrupt handler can pin a worker or kthreadd, exhaust the memory in the deferred init zone by attempting a large allocation, and spin in deferred_grow_zone() on node_size_lock, which the main thread can only release when kthreadd or the worker finishes. The first was seen on a uek6 kernel; the second is theoretically possible.

Meanwhile the rest of the system can be idle at this phase of boot, in which case the scheduler can't move threads waiting on the runqueue elsewhere.

The proper fix will probably involve scheduler changes, but as a stopgap, given the timing of the uek6 release, relieve the main ktask thread of having to wait on its helpers and avoid the deadlock by refcounting a dynamically allocated ktask_task object. With this approach, page init may use fewer threads than requested, with the same frequency as the hang was observed.

Thanks to Alejandro Jimenez for his assistance with this bug.

Orabug: 30835752

Reported-by: Gerald Gibson <[email protected]>
Signed-off-by: Daniel Jordan <[email protected]>
Reviewed-by: Khalid Aziz <[email protected]>
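The heart of the stopgap is a reference-counted lifetime for the ktask_task: each queued work (and the main thread) holds a reference, and whichever drops the last one frees the object, so the main thread can return without waiting for helpers to be scheduled. A minimal sketch of that idea, using a simplified, illustrative type and function name rather than the real ktask structures:

/*
 * Illustrative sketch only: a simplified stand-in for struct ktask_task,
 * showing the refcounting pattern this commit adds.
 */
#include <linux/atomic.h>
#include <linux/slab.h>

struct async_job_sketch {
	atomic_t refcnt;	/* one reference per work, plus the main thread */
	/* ... shared job state ... */
};

/*
 * Each helper drops its reference as it finishes, and the main thread drops
 * its own once the job's remaining size reaches zero; the last reference
 * frees the object, so nobody has to wait for anyone else to run.
 */
static void job_sketch_put(struct async_job_sketch *job)
{
	if (atomic_dec_and_test(&job->refcnt))
		kfree(job);
}

As the patch below shows, if the dynamically allocated ktask_task can't be obtained, ktask falls back to an on-stack object and a single thread so that no helpers are started that the main thread could deadlock on.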
1 parent 2971013 commit 40f3a4f

File tree

3 files changed: +109 −14 lines changed

include/linux/ktask.h
kernel/ktask.c
mm/page_alloc.c


include/linux/ktask.h

Lines changed: 8 additions & 1 deletion
@@ -187,7 +187,14 @@ static inline void ktask_ctl_set_max_threads(struct ktask_ctl *ctl,
 }
 
 enum {
-	KTASK_ATOMIC = 1,
+	KTASK_ATOMIC = 1,	/* Main thread can't sleep. */
+	KTASK_ASYNC_HELPERS = 2,	/* Helpers may run after job is done. */
+
+	/*
+	 * Helpers disable irqs.  The ktask user is responsible for making sure
+	 * that disabling IRQs for long periods of time is safe for its job.
+	 */
+	KTASK_IRQS_OFF = 4,
 };
 
 /*
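For reference, these flags are OR'd together when defining a ktask control, exactly as the mm/page_alloc.c hunk below does. A minimal hypothetical caller sketch (my_thread_func and my_args are stand-ins, not part of this patch):

/* Hypothetical caller: only the flag combination mirrors this commit. */
DEFINE_KTASK_CTL(ctl, my_thread_func, &my_args, KTASK_PTE_MINCHUNK,
		 KTASK_ATOMIC | KTASK_ASYNC_HELPERS | KTASK_IRQS_OFF);

Per the comment added above, a caller passing KTASK_IRQS_OFF must be sure its job tolerates long IRQ-disabled stretches in the helpers.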

kernel/ktask.c

Lines changed: 99 additions & 12 deletions
@@ -64,6 +64,7 @@ static struct ktask_work *ktask_works;
 struct ktask_task {
 	struct ktask_ctl	kt_ctl;
 	size_t			kt_total_size;
+	size_t			kt_remaining_size;
 	size_t			kt_chunk_size;
 	/* protects this struct and struct ktask_work's of a running task */
 	spinlock_t		kt_lock;
@@ -78,6 +79,7 @@ struct ktask_task {
 	struct completion	kt_ktask_done;
 	int			kt_complete;
 	};
+	atomic_t		kt_refcnt;
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	kt_lockdep_map;
 #endif
@@ -253,7 +255,11 @@ static void ktask_thread(struct work_struct *work)
 	struct ktask_task *kt = kw->kw_task;
 	struct ktask_ctl *kc = &kt->kt_ctl;
 	struct ktask_node *kn = &kt->kt_nodes[kw->kw_ktask_node_i];
-	bool done;
+	bool done = false;
+	unsigned long uninitialized_var(flags);
+
+	if (kc->kc_flags & KTASK_IRQS_OFF)
+		local_irq_save(flags);
 
 	spin_lock(&kt->kt_lock);
 
@@ -300,6 +306,8 @@ static void ktask_thread(struct work_struct *work)
 		/* Start another worker on the node we've chosen. */
 		if (ktask_node_migrate(old_kn, kn, i, kw, kt)) {
 			spin_unlock(&kt->kt_lock);
+			if (kc->kc_flags & KTASK_IRQS_OFF)
+				local_irq_restore(flags);
 			return;
 		}
 	}
@@ -327,6 +335,13 @@ static void ktask_thread(struct work_struct *work)
 		lock_map_release(&kt->kt_lockdep_map);
 		spin_lock(&kt->kt_lock);
 
+		WARN_ON(kt->kt_remaining_size < size);
+		kt->kt_remaining_size -= size;
+
+		if (kt->kt_remaining_size == 0 &&
+		    (kc->kc_flags & KTASK_ASYNC_HELPERS))
+			done = true;
+
 		if (ret != KTASK_RETURN_SUCCESS) {
 			/* Save first error code only. */
 			if (kt->kt_error == KTASK_RETURN_SUCCESS)
@@ -349,11 +364,24 @@ static void ktask_thread(struct work_struct *work)
 
 	++kt->kt_nworks_fini;
 	WARN_ON(kt->kt_nworks_fini > kt->kt_nworks);
-	done = (kt->kt_nworks_fini == kt->kt_nworks);
+	if (!(kc->kc_flags & KTASK_ASYNC_HELPERS))
+		done = (kt->kt_nworks_fini == kt->kt_nworks);
 	spin_unlock(&kt->kt_lock);
 
 	if (done)
 		ktask_complete(kt);
+
+	if (kc->kc_flags & KTASK_IRQS_OFF)
+		local_irq_restore(flags);
+
+	if ((kc->kc_flags & KTASK_ASYNC_HELPERS) && !kw->kw_master_thr) {
+		spin_lock(&ktask_rlim_lock);
+		ktask_fini_work(kw);
+		spin_unlock(&ktask_rlim_lock);
+
+		if (atomic_dec_and_test(&kt->kt_refcnt))
+			kfree(kt);
+	}
 }
 
 /*
@@ -385,7 +413,14 @@ static size_t ktask_chunk_size(size_t task_size, size_t min_chunk_size,
 }
 
 /*
- * Returns the number of works to be used in the task.  This number includes
+ * ktask_init_works - allocate the right number of ktask_work structs for the job
+ * @nodes: array describing the job, split up by node
+ * @nr_nodes: length of the array
+ * @kt: the object holding the job state
+ *
+ * Assumes ktask_rlim_lock is held.
+ *
+ * Return: The number of works to be used in the task.  This number includes
  * the current thread, so a return value of 1 means no extra threads are
  * started.
  */
@@ -396,6 +431,8 @@ static size_t ktask_init_works(struct ktask_node *nodes, size_t nr_nodes,
 	size_t min_chunk_size = kt->kt_ctl.kc_min_chunk_size;
 	size_t max_threads = kt->kt_ctl.kc_max_threads;
 
+	lockdep_assert_held(&ktask_rlim_lock);
+
 	if (!ktask_wq)
 		return 1;
 
@@ -415,7 +452,6 @@ static size_t ktask_init_works(struct ktask_node *nodes, size_t nr_nodes,
 	 * ktask_rlim allows additional work items to be queued.
 	 */
 	nr_works = 1;
-	spin_lock(&ktask_rlim_lock);
 	for (i = nr_works; i < nr_works_check; ++i) {
 		/* Allocate works evenly over the task's given nodes. */
 		size_t ktask_node_i = i % nr_nodes;
@@ -447,7 +483,6 @@ static size_t ktask_init_works(struct ktask_node *nodes, size_t nr_nodes,
 		++ktask_rlim_cur;
 		++nr_works;
 	}
-	spin_unlock(&ktask_rlim_lock);
 
 	return nr_works;
 }
@@ -536,6 +571,7 @@ static int ktask_do_task(struct ktask_task *kt, struct ktask_node *nodes,
 	size_t i, total_size;
 	struct ktask_work kw;
 	struct ktask_work *work;
+	int error;
 
 	for (i = 0, total_size = 0; i < nr_nodes; ++i) {
 		total_size += nodes[i].kn_task_size;
@@ -553,6 +589,7 @@ static int ktask_do_task(struct ktask_task *kt, struct ktask_node *nodes,
 
 	kt->kt_ctl = *ctl;
 	kt->kt_total_size = total_size;
+	kt->kt_remaining_size = total_size;
 	kt->kt_nodes = nodes;
 	kt->kt_nr_nodes = nr_nodes;
 	kt->kt_nr_nodes_left = nr_nodes;
@@ -567,37 +604,87 @@ static int ktask_do_task(struct ktask_task *kt, struct ktask_node *nodes,
 	lock_map_acquire(&kt->kt_lockdep_map);
 	lock_map_release(&kt->kt_lockdep_map);
 
+	spin_lock(&ktask_rlim_lock);
+
 	kt->kt_nworks = ktask_init_works(nodes, nr_nodes, kt);
 	kt->kt_chunk_size = ktask_chunk_size(kt->kt_total_size,
 					     ctl->kc_min_chunk_size,
 					     kt->kt_nworks);
 
+	if (ctl->kc_flags & KTASK_ASYNC_HELPERS)
+		atomic_set(&kt->kt_refcnt, kt->kt_nworks);
+
 	list_for_each_entry(work, &kt->kt_works_list, kw_list)
 		ktask_queue_work(work);
 
+	/*
+	 * Drop the lock after the list_for_each_entry loop above prevents
+	 * kt_works_list corruption when a ktask_thread calls ktask_work_fini()
+	 * to remove a ktask_work from the list.
+	 */
+	spin_unlock(&ktask_rlim_lock);
+
 	/* Use the current thread, which saves starting a workqueue worker. */
 	ktask_init_work(&kw, kt, 0, nodes[0].kn_nid, true);
 	INIT_LIST_HEAD(&kw.kw_list);
 	ktask_thread(&kw.kw_work);
 
-	/* Wait for all the jobs to finish. */
+	/* Wait for the job to finish. */
 	ktask_wait_for_completion(kt);
 
-	if (kt->kt_error != KTASK_RETURN_SUCCESS && ctl->kc_undo_func)
-		ktask_undo(nodes, nr_nodes, ctl, &kt->kt_works_list, &kw);
+	error = kt->kt_error;
 
-	ktask_fini_works(kt);
+	if (ctl->kc_flags & KTASK_ASYNC_HELPERS) {
+		if (atomic_dec_and_test(&kt->kt_refcnt))
+			kfree(kt);
+	} else {
+		if (error != KTASK_RETURN_SUCCESS && ctl->kc_undo_func)
+			ktask_undo(nodes, nr_nodes, ctl, &kt->kt_works_list, &kw);
+
+		ktask_fini_works(kt);
+	}
 
-	return kt->kt_error;
+	return error;
 }
 
 int __ktask_run_numa(struct ktask_node *nodes, size_t nr_nodes,
 		     struct ktask_ctl *ctl, struct lock_class_key *key,
 		     const char *map_name)
 {
-	struct ktask_task kt;
+	struct ktask_task *kt = NULL;
+	int flags = ctl->kc_flags;
+
+	/*
+	 * Use a refcounted ktask_task to avoid waiting for all helpers to
+	 * finish before the task is done.  Useful in contexts where deadlock
+	 * is possible from the main thread and its helpers waiting on each
+	 * other to finish.
+	 */
+	if (flags & KTASK_ASYNC_HELPERS) {
+		gfp_t gfp = (flags & KTASK_ATOMIC) ? GFP_ATOMIC : GFP_KERNEL;
+
+		kt = kmalloc(sizeof(struct ktask_task), gfp);
+	}
+
+	if (kt) {
+		return ktask_do_task(kt, nodes, nr_nodes, ctl, key, map_name);
+	} else {
+		struct ktask_task kt_onstack;
 
-	return ktask_do_task(&kt, nodes, nr_nodes, ctl, key, map_name);
+		kt = &kt_onstack;
+
+		if (flags & KTASK_ASYNC_HELPERS) {
+			/*
+			 * kmalloc() failed so refcounting ktask_task isn't
+			 * possible.  Fall back to one thread to avoid starting
			 * helpers that this thread may deadlock on.
+			 */
+			ctl->kc_max_threads = 1;
+			ctl->kc_flags &= ~KTASK_ASYNC_HELPERS;
+		}
+
+		return ktask_do_task(kt, nodes, nr_nodes, ctl, key, map_name);
+	}
 }
 
 /*

mm/page_alloc.c

Lines changed: 2 additions & 1 deletion
@@ -1850,7 +1850,8 @@ static int __init deferred_init_memmap(void *data)
 	struct def_init_args args = { zone,
				      ATOMIC_LONG_INIT(0) };
 	DEFINE_KTASK_CTL(ctl, deferred_init_memmap_chunk, &args,
-			 KTASK_PTE_MINCHUNK, KTASK_ATOMIC);
+			 KTASK_PTE_MINCHUNK, KTASK_ATOMIC |
+			 KTASK_ASYNC_HELPERS | KTASK_IRQS_OFF);
 
 	/*
 	 * Helper threads should operate on MAX_ORDER_NR_PAGES
