Commit e93ad19
cpuset: make mm migration asynchronous
If "cpuset.memory_migrate" is set, when a process is moved from one cpuset to another with a different memory node mask, pages in used by the process are migrated to the new set of nodes. This was performed synchronously in the ->attach() callback, which is synchronized against process management. Recently, the synchronization was changed from per-process rwsem to global percpu rwsem for simplicity and optimization. Combined with the synchronous mm migration, this led to deadlocks because mm migration could schedule a work item which may in turn try to create a new worker blocking on the process management lock held from cgroup process migration path. This heavy an operation shouldn't be performed synchronously from that deep inside cgroup migration in the first place. This patch punts the actual migration to an ordered workqueue and updates cgroup process migration and cpuset config update paths to flush the workqueue after all locks are released. This way, the operations still seem synchronous to userland without entangling mm migration with process management synchronization. CPU hotplug can also invoke mm migration but there's no reason for it to wait for mm migrations and thus doesn't synchronize against their completions. Signed-off-by: Tejun Heo <[email protected]> Reported-and-tested-by: Christian Borntraeger <[email protected]> Cc: [email protected] # v4.4+
1 parent 3e1e21c commit e93ad19
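
The pattern the patch relies on is worth spelling out: the heavy operation is queued on an ordered (FIFO, one-at-a-time) workqueue from inside the locked region, and the queue is flushed only after every lock has been dropped, so the write still appears synchronous to userland. Below is a minimal userspace sketch of that punt-and-flush pattern, not kernel code: pthreads stand in for the kernel workqueue API, and queue_work(), flush_workqueue(), and migrate_fn() here are local stand-ins named after the real symbols in the diffs that follow.

/*
 * Minimal userspace sketch of the punt-and-flush pattern (pthreads
 * stand in for the kernel's ordered workqueue; all names are local
 * stand-ins, not the kernel API).  Build with: cc sketch.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
        void (*fn)(struct work *);
        struct work *next;
};

static struct work *head, *tail;
static int pending;                     /* queued but not yet finished */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Cheap for the caller, so it is safe deep inside a locked region. */
static void queue_work(struct work *w)
{
        pthread_mutex_lock(&lock);
        w->next = NULL;
        if (tail)
                tail->next = w;
        else
                head = w;
        tail = w;
        pending++;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

/* Wait for everything queued so far; call only after dropping locks. */
static void flush_workqueue(void)
{
        pthread_mutex_lock(&lock);
        while (pending)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

/* One worker thread == ordered workqueue: strict FIFO, one item at a time. */
static void *worker(void *arg)
{
        (void)arg;
        for (;;) {
                struct work *w;

                pthread_mutex_lock(&lock);
                while (!head)
                        pthread_cond_wait(&cond, &lock);
                w = head;
                head = w->next;
                if (!head)
                        tail = NULL;
                pthread_mutex_unlock(&lock);

                w->fn(w);               /* the heavy part runs unlocked */

                pthread_mutex_lock(&lock);
                pending--;
                pthread_cond_broadcast(&cond);
                pthread_mutex_unlock(&lock);
        }
        return NULL;
}

/* Stand-in for cpuset_migrate_mm_workfn(): the slow migration itself. */
static void migrate_fn(struct work *w)
{
        puts("migrating pages...");
        free(w);
}

int main(void)
{
        pthread_t tid;
        struct work *w = malloc(sizeof(*w));

        pthread_create(&tid, NULL, worker, NULL);
        w->fn = migrate_fn;
        queue_work(w);          /* attach path: punt instead of migrating */
        flush_workqueue();      /* after all locks are released */
        puts("flush done; migration appears synchronous to the caller");
        return 0;
}

Because a single worker drains the queue in submission order, one flush_workqueue() call waits for every migration queued so far; that is why cpuset_write_resmask() and __cgroup_procs_write() in the diffs below can simply flush after unlocking instead of tracking individual work items.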

3 files changed (+57, −22 lines)

include/linux/cpuset.h

Lines changed: 6 additions & 0 deletions
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern void cpuset_post_attach_flush(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline void cpuset_post_attach_flush(void)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */

kernel/cgroup.c

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
@@ -2739,6 +2740,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
+	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 

kernel/cpuset.c

Lines changed: 49 additions & 22 deletions
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
 */
 
+struct cpuset_migrate_mm_work {
+	struct work_struct	work;
+	struct mm_struct	*mm;
+	nodemask_t		from;
+	nodemask_t		to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+	struct cpuset_migrate_mm_work *mwork =
+		container_of(work, struct cpuset_migrate_mm_work, work);
+
+	/* on a wq worker, no need to worry about %current's mems_allowed */
+	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+	mmput(mwork->mm);
+	kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 							const nodemask_t *to)
 {
-	struct task_struct *tsk = current;
-
-	tsk->mems_allowed = *to;
+	struct cpuset_migrate_mm_work *mwork;
 
-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+	if (mwork) {
+		mwork->mm = mm;
+		mwork->from = *from;
+		mwork->to = *to;
+		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+		queue_work(cpuset_migrate_mm_wq, &mwork->work);
+	} else {
+		mmput(mm);
+	}
+}
 
-	rcu_read_lock();
-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-	rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+	flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-		mmput(mm);
+		else
+			mmput(mm);
 	}
 	css_task_iter_end(&it);
 
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 			 * @old_mems_allowed is the right nodesets that we
 			 * migrate mm from.
 			 */
-			if (is_memory_migrate(cs)) {
+			if (is_memory_migrate(cs))
 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 						  &cpuset_attach_nodemask_to);
-			}
-			mmput(mm);
+			else
+				mmput(mm);
 		}
 	}
 
@@ -1714,6 +1737,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
+	flush_workqueue(cpuset_migrate_mm_wq);
 	return retval ?: nbytes;
 }
 
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
 	top_cpuset.effective_mems = node_states[N_MEMORY];
 
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
