Commit 2129258

aagit authored and torvalds committed
mm: oom: let oom_reap_task and exit_mmap run concurrently
This is required purely because exit_aio() may block, so exit_mmap() may never start if the OOM reaper cannot run on an mm whose mm_users has already dropped to zero. At the same time, if the OOM reaper did not wait at all for the memory of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would generate spurious OOM kills.

If it were not for exit_aio() and similar blocking functions in the last mmput, it would be enough to change oom_reap_task(), in the case it finds mm_users == 0, to wait for a timeout or for __mmput() to set MMF_OOM_SKIP itself. But exit_mmap() is not the only problem here, so running exit_mmap() and oom_reap_task() concurrently is warranted.

The runtime is non-standard: exit_mmap() runs without mmap_sem, while oom_reap_task() takes mmap_sem for reading as usual (much like MADV_DONTNEED). The race between the two is solved with a combination of tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP (serialized by a dummy down_write()/up_write() cycle, along the same lines as the ksm_exit() method).

If oom_reap_task() is running concurrently with exit_mmap(), exit_mmap() waits for it to finish in down_write() before taking down the mm structures that would otherwise make oom_reap_task() fail with a use-after-free. If exit_mmap() comes first, oom_reap_task() skips the mm because MMF_OOM_SKIP is already set; in that case all memory has already been freed, and the mm data structures may already have been taken down by free_pgtables().

[[email protected]: incremental one liner]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: remove unused mmput_async]
Link: http://lkml.kernel.org/r/[email protected]
[[email protected]: microoptimization]
Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Fixes: 26db62f ("oom: keep mm of the killed task available")
Signed-off-by: Andrea Arcangeli <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
Reported-by: David Rientjes <[email protected]>
Tested-by: David Rientjes <[email protected]>
Reviewed-by: Michal Hocko <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent a2468cc commit 2129258
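
The locking scheme described above is easy to model outside the kernel. The sketch below is a userspace illustration, not kernel code: a pthread_rwlock_t stands in for mmap_sem, an atomic flag stands in for MMF_OOM_SKIP, and the reaper/exiter names and printed messages are invented for the example.

/*
 * Userspace model of the exit_mmap()/oom_reap_task() handshake.
 * Build: cc -pthread handshake.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool oom_skip;            /* plays the role of MMF_OOM_SKIP */

static void *reaper(void *arg)          /* oom_reap_task() side */
{
        pthread_rwlock_rdlock(&mmap_sem);       /* down_read(&mm->mmap_sem) */
        if (atomic_load(&oom_skip)) {
                /* exit_mmap() already ran: memory is gone, back off */
                pthread_rwlock_unlock(&mmap_sem);
                puts("reaper: MMF_OOM_SKIP set, skipping");
                return NULL;
        }
        /* Safe to reap: the exiter cannot pass down_write() while we
         * hold the lock for reading. */
        puts("reaper: reaping; exit_mmap() waits for us");
        pthread_rwlock_unlock(&mmap_sem);       /* up_read() */
        return NULL;
}

static void *exiter(void *arg)          /* exit_mmap() side */
{
        atomic_store(&oom_skip, true);  /* set_bit(MMF_OOM_SKIP, ...) */
        /* Dummy write-lock cycle: returns only after any reaper that
         * missed the flag has dropped the read lock. */
        pthread_rwlock_wrlock(&mmap_sem);       /* down_write() */
        pthread_rwlock_unlock(&mmap_sem);       /* up_write() */
        puts("exiter: no reaper in flight, freeing page tables");
        return NULL;
}

int main(void)
{
        pthread_t r, e;

        pthread_create(&r, NULL, reaper, NULL);
        pthread_create(&e, NULL, exiter, NULL);
        pthread_join(r, NULL);
        pthread_join(e, NULL);
        return 0;
}

Whichever thread wins the race, the exiter never proceeds to tear anything down while a reaper is inside the read-side critical section, and a reaper that arrives after the flag is published backs off; these are exactly the two orderings the commit message enumerates.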

File tree

4 files changed: +23 -33 lines changed

include/linux/sched/mm.h

Lines changed: 0 additions & 6 deletions

@@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
-/* same as above but performs the slow path from the async context. Can
- * be called from the atomic context as well
- */
-extern void mmput_async(struct mm_struct *);
-#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);

kernel/fork.c

Lines changed: 0 additions & 17 deletions

@@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
-       set_bit(MMF_OOM_SKIP, &mm->flags);
        mmdrop(mm);
 }
 
@@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-#ifdef CONFIG_MMU
-static void mmput_async_fn(struct work_struct *work)
-{
-       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-       __mmput(mm);
-}
-
-void mmput_async(struct mm_struct *mm)
-{
-       if (atomic_dec_and_test(&mm->mm_users)) {
-               INIT_WORK(&mm->async_put_work, mmput_async_fn);
-               schedule_work(&mm->async_put_work);
-       }
-}
-#endif
-
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *

mm/mmap.c

Lines changed: 18 additions & 0 deletions

@@ -44,6 +44,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/moduleparam.h>
 #include <linux/pkeys.h>
+#include <linux/oom.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -3001,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, vma, 0, -1);
 
+       set_bit(MMF_OOM_SKIP, &mm->flags);
+       if (unlikely(tsk_is_oom_victim(current))) {
+               /*
+                * Wait for oom_reap_task() to stop working on this
+                * mm. Because MMF_OOM_SKIP is already set before
+                * calling down_read(), oom_reap_task() will not run
+                * on this "mm" post up_write().
+                *
+                * tsk_is_oom_victim() cannot be set from under us
+                * either because current->mm is already set to NULL
+                * under task_lock before calling mmput and oom_mm is
+                * set not NULL by the OOM killer only if current->mm
+                * is found not NULL while holding the task_lock.
+                */
+               down_write(&mm->mmap_sem);
+               up_write(&mm->mmap_sem);
+       }
        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, 0, -1);

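The ordering inside the exit_mmap() hunk above is deliberate: set_bit(MMF_OOM_SKIP) must come before the down_write()/up_write() cycle. The following userspace stress test (same stand-ins as the earlier sketch; page, oom_skip, and the iteration count are illustrative only) exercises that ordering; swapping the two steps in exiter() reopens the window where a reaper sees the flag unset after the write lock has already been released.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

static pthread_rwlock_t mmap_sem;
static atomic_bool oom_skip;
static int *page;                       /* stands in for the victim's memory */

static void *reaper(void *arg)
{
        pthread_rwlock_rdlock(&mmap_sem);
        if (!atomic_load(&oom_skip))
                assert(*page == 42);    /* memory must still be valid here */
        pthread_rwlock_unlock(&mmap_sem);
        return NULL;
}

static void *exiter(void *arg)
{
        atomic_store(&oom_skip, true);          /* step 1: publish the flag */
        pthread_rwlock_wrlock(&mmap_sem);       /* step 2: wait out readers */
        pthread_rwlock_unlock(&mmap_sem);
        free(page);                             /* free_pgtables() analogue */
        page = NULL;
        return NULL;
}

int main(void)
{
        for (int i = 0; i < 100000; i++) {
                pthread_t r, e;

                pthread_rwlock_init(&mmap_sem, NULL);
                atomic_store(&oom_skip, false);
                page = malloc(sizeof(*page));
                *page = 42;
                pthread_create(&r, NULL, reaper, NULL);
                pthread_create(&e, NULL, exiter, NULL);
                pthread_join(r, NULL);
                pthread_join(e, NULL);
                pthread_rwlock_destroy(&mmap_sem);
        }
        return 0;
}

With the patch's ordering, any reaper that observes the flag unset still holds the read lock, so the exiter's down_write() cannot return until the dereference has finished. With steps 1 and 2 swapped, the assert can land on freed memory, although, as with any race, a given run is not guaranteed to hit it.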
mm/oom_kill.c

Lines changed: 5 additions & 10 deletions

@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
        }
 
        /*
-        * increase mm_users only after we know we will reap something so
-        * that the mmput_async is called only when we have reaped something
-        * and delayed __mmput doesn't matter that much
+        * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
+        * work on the mm anymore. The check for MMF_OOM_SKIP must run
+        * under mmap_sem for reading because it serializes against the
+        * down_write();up_write() cycle in exit_mmap().
         */
-       if (!mmget_not_zero(mm)) {
+       if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
                up_read(&mm->mmap_sem);
                trace_skip_task_reaping(tsk->pid);
                goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
        up_read(&mm->mmap_sem);
 
-       /*
-        * Drop our reference but make sure the mmput slow path is called from a
-        * different context because we shouldn't risk we get stuck there and
-        * put the oom_reaper out of the way.
-        */
-       mmput_async(mm);
        trace_finish_task_reaping(tsk->pid);
 unlock_oom:
        mutex_unlock(&oom_lock);
