Commit 26db62f

Michal Hocko authored and torvalds committed
oom: keep mm of the killed task available
oom_reap_task has to call exit_oom_victim in order to make sure that the oom victim will not block the oom killer forever. This is, however, opening new problems (e.g. oom_killer_disable exclusion - see commit 7407054 ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should ideally be called only from the victim's context.

One way to achieve this would be to rely on per mm_struct flags. We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". The problem is that the exit path:

  do_exit
    exit_mm
      tsk->mm = NULL;
      mmput
        __mmput
          exit_oom_victim

doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory, and who knows what else is lurking there.

This patch takes a different approach. We remember tsk->mm in the signal_struct and bind it to the signal struct lifetime for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread no longer have to rely on find_lock_task_mm and will have a reliable reference to the mm struct. As a result, all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm.

Increasing the signal_struct for something as unlikely as the oom killer is far from ideal, but this approach makes the code much more reasonable, and long term we might even want to move task->mm into the signal_struct anyway. In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent, which would also be nice.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Michal Hocko <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8496afa commit 26db62f
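
The change boils down to a publish-once-and-pin lifetime rule: mark_oom_victim() records tsk->mm in signal->oom_mm at most once via cmpxchg() and pins it with an extra mm_count reference, and free_signal_struct() later drops that reference with mmdrop(), so the oom killer and the oom reaper can dereference signal->oom_mm without find_lock_task_mm(). Below is a minimal user-space sketch of the same pattern; every name in it (struct container, mark_victim, mm_drop, and so on) is illustrative only, and C11 atomics stand in for the kernel's cmpxchg() and mm->mm_count.

    /*
     * Minimal user-space sketch of the publish-once + pin pattern that the
     * patch applies to signal->oom_mm.  All names here are illustrative;
     * C11 atomics stand in for the kernel's cmpxchg() and mm->mm_count.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct mm {                             /* stand-in for struct mm_struct */
            atomic_int refcount;            /* stand-in for mm->mm_count */
    };

    struct container {                      /* stand-in for struct signal_struct */
            _Atomic(struct mm *) oom_mm;
    };

    static void mm_drop(struct mm *mm)      /* stand-in for mmdrop() */
    {
            if (atomic_fetch_sub(&mm->refcount, 1) == 1)
                    free(mm);
    }

    /*
     * Record mm in the container exactly once and take a reference for it.
     * The caller is assumed to hold its own reference, just as the victim
     * task still holds tsk->mm when mark_oom_victim() runs.
     */
    static void mark_victim(struct container *c, struct mm *mm)
    {
            struct mm *expected = NULL;

            if (atomic_compare_exchange_strong(&c->oom_mm, &expected, mm))
                    atomic_fetch_add(&mm->refcount, 1);
    }

    /* Mirrors free_signal_struct(): drop the pinned mm with the container. */
    static void free_container(struct container *c)
    {
            struct mm *mm = atomic_load(&c->oom_mm);

            if (mm)
                    mm_drop(mm);
            free(c);
    }

    int main(void)
    {
            struct mm *mm = calloc(1, sizeof(*mm));
            struct container *c = calloc(1, sizeof(*c));

            atomic_init(&mm->refcount, 1);  /* the "task's" own reference */
            atomic_init(&c->oom_mm, NULL);

            mark_victim(c, mm);             /* publishes and pins the mm */
            mark_victim(c, mm);             /* second call is a no-op */

            mm_drop(mm);                    /* the "task" exits, drops its ref */
            free_container(c);              /* container teardown drops the pin */
            printf("mm stayed valid until the container was freed\n");
            return 0;
    }

The only point of the sketch is the ordering: the pointer is published once, the pin outlives the task's own reference, and the memory is released only when the container itself goes away.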

File tree: 3 files changed, +23 -32 lines

include/linux/sched.h

Lines changed: 2 additions & 0 deletions

@@ -805,6 +805,8 @@ struct signal_struct {
 	short oom_score_adj;		/* OOM kill score adjustment */
 	short oom_score_adj_min;	/* OOM kill score adjustment min value.
 					 * Only settable by CAP_SYS_RESOURCE. */
+	struct mm_struct *oom_mm;	/* recorded mm when the thread group got
+					 * killed by the oom killer */
 
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations

kernel/fork.c

Lines changed: 2 additions & 0 deletions

@@ -359,6 +359,8 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
 	sched_autogroup_exit(sig);
+	if (sig->oom_mm)
+		mmdrop(sig->oom_mm);
 	kmem_cache_free(signal_cachep, sig);
 }

mm/oom_kill.c

Lines changed: 19 additions & 32 deletions

@@ -300,14 +300,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
-		struct task_struct *p = find_lock_task_mm(task);
-		bool reaped = false;
-
-		if (p) {
-			reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags);
-			task_unlock(p);
-		}
-		if (reaped)
+		if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags))
 			goto next;
 		goto abort;
 	}

@@ -536,11 +529,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 			K(get_mm_counter(mm, MM_SHMEMPAGES)));
 	up_read(&mm->mmap_sem);
 
-	/*
-	 * This task can be safely ignored because we cannot do much more
-	 * to release its memory.
-	 */
-	set_bit(MMF_OOM_REAPED, &mm->flags);
 	/*
 	 * Drop our reference but make sure the mmput slow path is called from a
 	 * different context because we shouldn't risk we get stuck there and

@@ -556,20 +544,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 static void oom_reap_task(struct task_struct *tsk)
 {
 	int attempts = 0;
-	struct mm_struct *mm = NULL;
-	struct task_struct *p = find_lock_task_mm(tsk);
-
-	/*
-	 * Make sure we find the associated mm_struct even when the particular
-	 * thread has already terminated and cleared its mm.
-	 * We might have race with exit path so consider our work done if there
-	 * is no mm.
-	 */
-	if (!p)
-		goto done;
-	mm = p->mm;
-	atomic_inc(&mm->mm_count);
-	task_unlock(p);
+	struct mm_struct *mm = tsk->signal->oom_mm;
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))

@@ -578,8 +553,6 @@ static void oom_reap_task(struct task_struct *tsk)
 	if (attempts <= MAX_OOM_REAP_RETRIES)
 		goto done;
 
-	/* Ignore this mm because somebody can't call up_write(mmap_sem). */
-	set_bit(MMF_OOM_REAPED, &mm->flags);
 
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);

@@ -595,11 +568,14 @@ static void oom_reap_task(struct task_struct *tsk)
 	tsk->oom_reaper_list = NULL;
 	exit_oom_victim(tsk);
 
+	/*
+	 * Hide this mm from OOM killer because it has been either reaped or
+	 * somebody can't call up_write(mmap_sem).
+	 */
+	set_bit(MMF_OOM_REAPED, &mm->flags);
+
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
-	/* Drop a reference taken above. */
-	if (mm)
-		mmdrop(mm);
 }
 
 static int oom_reaper(void *unused)

@@ -665,14 +641,25 @@ static inline void wake_oom_reaper(struct task_struct *tsk)
  *
  * Has to be called with oom_lock held and never after
  * oom has been disabled already.
+ *
+ * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
+ * under task_lock or operate on the current).
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
+	struct mm_struct *mm = tsk->mm;
+
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+
 	atomic_inc(&tsk->signal->oom_victims);
+
+	/* oom_mm is bound to the signal struct life time. */
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+		atomic_inc(&tsk->signal->oom_mm->mm_count);
+
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
