Skip to content

Commit 422580c

Browse files
rgushchin authored and torvalds committed
mm/oom_kill.c: add tracepoints for oom reaper-related events
During the debugging of the problem described in https://lkml.org/lkml/2017/5/17/542 and fixed by Tetsuo Handa in https://lkml.org/lkml/2017/5/19/383, I found that the existing debug output is not really useful for understanding issues related to the oom reaper. So I assume that adding some tracepoints might help with debugging similar issues.

Trace the following events:
1) a process is marked as an oom victim,
2) a process is added to the oom reaper list,
3) the oom reaper starts reaping the process's mm,
4) the oom reaper finishes reaping,
5) the oom reaper skips reaping.

How does it work in practice? Below is an example that shows how the problem mentioned above can be found: one process is added twice to the oom_reaper list:

$ cd /sys/kernel/debug/tracing
$ echo "oom:mark_victim" > set_event
$ echo "oom:wake_reaper" >> set_event
$ echo "oom:skip_task_reaping" >> set_event
$ echo "oom:start_task_reaping" >> set_event
$ echo "oom:finish_task_reaping" >> set_event
$ cat trace_pipe
allocate-502 [001] .... 91.836405: mark_victim: pid=502
allocate-502 [001] .N.. 91.837356: wake_reaper: pid=502
allocate-502 [000] .N.. 91.871149: wake_reaper: pid=502
oom_reaper-23 [000] .... 91.871177: start_task_reaping: pid=502
oom_reaper-23 [000] .N.. 91.879511: finish_task_reaping: pid=502
oom_reaper-23 [000] .... 91.879580: skip_task_reaping: pid=502

Link: http://lkml.kernel.org/r/20170530185231.GA13412@castle
Signed-off-by: Roman Gushchin <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: Tetsuo Handa <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 230ca98 commit 422580c

File tree

2 files changed

+87
-0
lines changed

2 files changed

+87
-0
lines changed

include/trace/events/oom.h

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,86 @@ TRACE_EVENT(reclaim_retry_zone,
7070
__entry->wmark_check)
7171
);
7272

73+
TRACE_EVENT(mark_victim,
74+
TP_PROTO(int pid),
75+
76+
TP_ARGS(pid),
77+
78+
TP_STRUCT__entry(
79+
__field(int, pid)
80+
),
81+
82+
TP_fast_assign(
83+
__entry->pid = pid;
84+
),
85+
86+
TP_printk("pid=%d", __entry->pid)
87+
);
88+
89+
TRACE_EVENT(wake_reaper,
90+
TP_PROTO(int pid),
91+
92+
TP_ARGS(pid),
93+
94+
TP_STRUCT__entry(
95+
__field(int, pid)
96+
),
97+
98+
TP_fast_assign(
99+
__entry->pid = pid;
100+
),
101+
102+
TP_printk("pid=%d", __entry->pid)
103+
);
104+
105+
TRACE_EVENT(start_task_reaping,
106+
TP_PROTO(int pid),
107+
108+
TP_ARGS(pid),
109+
110+
TP_STRUCT__entry(
111+
__field(int, pid)
112+
),
113+
114+
TP_fast_assign(
115+
__entry->pid = pid;
116+
),
117+
118+
TP_printk("pid=%d", __entry->pid)
119+
);
120+
121+
TRACE_EVENT(finish_task_reaping,
122+
TP_PROTO(int pid),
123+
124+
TP_ARGS(pid),
125+
126+
TP_STRUCT__entry(
127+
__field(int, pid)
128+
),
129+
130+
TP_fast_assign(
131+
__entry->pid = pid;
132+
),
133+
134+
TP_printk("pid=%d", __entry->pid)
135+
);
136+
137+
TRACE_EVENT(skip_task_reaping,
138+
TP_PROTO(int pid),
139+
140+
TP_ARGS(pid),
141+
142+
TP_STRUCT__entry(
143+
__field(int, pid)
144+
),
145+
146+
TP_fast_assign(
147+
__entry->pid = pid;
148+
),
149+
150+
TP_printk("pid=%d", __entry->pid)
151+
);
152+
73153
#ifdef CONFIG_COMPACTION
74154
TRACE_EVENT(compact_retry,
75155

mm/oom_kill.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
490490

491491
if (!down_read_trylock(&mm->mmap_sem)) {
492492
ret = false;
493+
trace_skip_task_reaping(tsk->pid);
493494
goto unlock_oom;
494495
}
495496

@@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
500501
*/
501502
if (!mmget_not_zero(mm)) {
502503
up_read(&mm->mmap_sem);
504+
trace_skip_task_reaping(tsk->pid);
503505
goto unlock_oom;
504506
}
505507

508+
trace_start_task_reaping(tsk->pid);
509+
506510
/*
507511
* Tell all users of get_user/copy_from_user etc... that the content
508512
* is no longer stable. No barriers really needed because unmapping
@@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
544548
* put the oom_reaper out of the way.
545549
*/
546550
mmput_async(mm);
551+
trace_finish_task_reaping(tsk->pid);
547552
unlock_oom:
548553
mutex_unlock(&oom_lock);
549554
return ret;
@@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk)
615620
tsk->oom_reaper_list = oom_reaper_list;
616621
oom_reaper_list = tsk;
617622
spin_unlock(&oom_reaper_lock);
623+
trace_wake_reaper(tsk->pid);
618624
wake_up(&oom_reaper_wait);
619625
}
620626

@@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk)
666672
*/
667673
__thaw_task(tsk);
668674
atomic_inc(&oom_victims);
675+
trace_mark_victim(tsk->pid);
669676
}
670677

671678
/**

0 commit comments

Comments
 (0)