Skip to content

Commit a2e5144

Browse files
dvyukov authored and torvalds committed
kernel/hung_task.c: allow to set checking interval separately from timeout
Currently the hung task checking interval is equal to the timeout; as a result, a hang is detected anywhere between timeout and 2*timeout. This is fine for most interactive environments, but it hurts automated testing setups (syzbot). In an automated setup we need to strictly order CPU lockup < RCU stall < workqueue lockup < task hung < silent loss, so that an RCU stall is not detected as a task hung and a task hung is not detected as a silent machine loss. The large variance in the task hung detection timeout requires setting the silent machine loss timeout to a very large value (e.g. if task hung is 3 mins, then silent loss needs to be set to ~7 mins). The additional 3 minutes significantly reduce testing efficiency because usually we crash the kernel within a minute, and this can add hours to the bug localization process, as it needs to run dozens of tests. Allow setting the checking interval separately from the timeout. This allows setting the timeout to, say, 3 minutes, but the checking interval to 10 secs. The interval is controlled via a new hung_task_check_interval_secs sysctl, similar to the existing hung_task_timeout_secs sysctl. The default value of 0 results in the current behavior: the checking interval is equal to the timeout. [[email protected]: update hung_task_timeout_max's comment] Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Dmitry Vyukov <[email protected]> Cc: Paul E. McKenney <[email protected]> Cc: Tetsuo Handa <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Ingo Molnar <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 91bc9aa commit a2e5144

File tree

6 files changed

+43
-3
lines changed

6 files changed

+43
-3
lines changed

Documentation/sysctl/kernel.txt

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ show up in /proc/sys/kernel:
3838
- hung_task_panic
3939
- hung_task_check_count
4040
- hung_task_timeout_secs
41+
- hung_task_check_interval_secs
4142
- hung_task_warnings
4243
- hyperv_record_panic_msg
4344
- kexec_load_disabled
@@ -355,7 +356,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
355356

356357
hung_task_timeout_secs:
357358

358-
Check interval. When a task in D state did not get scheduled
359+
When a task in D state did not get scheduled
359360
for more than this value report a warning.
360361
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
361362

@@ -364,6 +365,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}.
364365

365366
==============================================================
366367

368+
hung_task_check_interval_secs:
369+
370+
Hung task check interval. If hung task checking is enabled
371+
(see hung_task_timeout_secs), the check is done every
372+
hung_task_check_interval_secs seconds.
373+
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
374+
375+
0 (default): means use hung_task_timeout_secs as checking interval.
376+
Possible values to set are in range {0..LONG_MAX/HZ}.
377+
378+
==============================================================
379+
367380
hung_task_warnings:
368381

369382
The maximum number of warnings to report. During a check interval

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -853,6 +853,7 @@ struct task_struct {
853853
#endif
854854
#ifdef CONFIG_DETECT_HUNG_TASK
855855
unsigned long last_switch_count;
856+
unsigned long last_switch_time;
856857
#endif
857858
/* Filesystem information: */
858859
struct fs_struct *fs;

include/linux/sched/sysctl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ struct ctl_table;
1010
extern int sysctl_hung_task_check_count;
1111
extern unsigned int sysctl_hung_task_panic;
1212
extern unsigned long sysctl_hung_task_timeout_secs;
13+
extern unsigned long sysctl_hung_task_check_interval_secs;
1314
extern int sysctl_hung_task_warnings;
1415
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
1516
void __user *buffer,

kernel/fork.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
13021302
tsk->nvcsw = tsk->nivcsw = 0;
13031303
#ifdef CONFIG_DETECT_HUNG_TASK
13041304
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1305+
tsk->last_switch_time = 0;
13051306
#endif
13061307

13071308
tsk->mm = NULL;

kernel/hung_task.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
4040
*/
4141
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
4242

43+
/*
44+
* Zero (default value) means use sysctl_hung_task_timeout_secs:
45+
*/
46+
unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
47+
4348
int __read_mostly sysctl_hung_task_warnings = 10;
4449

4550
static int __read_mostly did_panic;
@@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98103

99104
if (switch_count != t->last_switch_count) {
100105
t->last_switch_count = switch_count;
106+
t->last_switch_time = jiffies;
101107
return;
102108
}
109+
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
110+
return;
103111

104112
trace_sched_process_hang(t);
105113

@@ -245,8 +253,13 @@ static int watchdog(void *dummy)
245253

246254
for ( ; ; ) {
247255
unsigned long timeout = sysctl_hung_task_timeout_secs;
248-
long t = hung_timeout_jiffies(hung_last_checked, timeout);
256+
unsigned long interval = sysctl_hung_task_check_interval_secs;
257+
long t;
249258

259+
if (interval == 0)
260+
interval = timeout;
261+
interval = min_t(unsigned long, interval, timeout);
262+
t = hung_timeout_jiffies(hung_last_checked, interval);
250263
if (t <= 0) {
251264
if (!atomic_xchg(&reset_hung_task, 0))
252265
check_hung_uninterruptible_tasks(timeout);

kernel/sysctl.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,10 @@ static int minolduid;
145145
static int ngroups_max = NGROUPS_MAX;
146146
static const int cap_last_cap = CAP_LAST_CAP;
147147

148-
/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
148+
/*
149+
* This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
150+
* and hung_task_check_interval_secs
151+
*/
149152
#ifdef CONFIG_DETECT_HUNG_TASK
150153
static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
151154
#endif
@@ -1090,6 +1093,14 @@ static struct ctl_table kern_table[] = {
10901093
.proc_handler = proc_dohung_task_timeout_secs,
10911094
.extra2 = &hung_task_timeout_max,
10921095
},
1096+
{
1097+
.procname = "hung_task_check_interval_secs",
1098+
.data = &sysctl_hung_task_check_interval_secs,
1099+
.maxlen = sizeof(unsigned long),
1100+
.mode = 0644,
1101+
.proc_handler = proc_dohung_task_timeout_secs,
1102+
.extra2 = &hung_task_timeout_max,
1103+
},
10931104
{
10941105
.procname = "hung_task_warnings",
10951106
.data = &sysctl_hung_task_warnings,

0 commit comments

Comments
 (0)