// SPDX-License-Identifier: GPL-2.0-only
/*
 * Detect Hung Task
 *
 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
 *
 */

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
#include <linux/rwsem.h>

#include <trace/events/sched.h>

/*
 * The maximum number of tasks checked in one scan:
 */
static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;

/*
 * Total number of tasks detected as hung since boot:
 */
static unsigned long __read_mostly sysctl_hung_task_detect_count;

/*
 * Limit the number of tasks checked in a batch.
 *
 * This value controls the preemptibility of khungtaskd since preemption
 * is disabled during the critical section. It also controls the size of
 * the RCU grace period. So it needs to be upper-bounded.
 */
#define HUNG_TASK_LOCK_BREAK (HZ / 10)
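/*
 * For example, with HZ=250 the break interval is 25 jiffies, i.e. the
 * scan drops the RCU read lock and may reschedule roughly every 100ms
 * of wall-clock scanning time, whatever the HZ value.
 */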

/*
 * Zero means infinite timeout - no checking done:
 */
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
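
/*
 * Runtime usage example (the sysctl is registered below under "kernel"):
 *   echo 300 > /proc/sys/kernel/hung_task_timeout_secs
 * raises the detection threshold to five minutes.
 */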

/*
 * Zero (default value) means use sysctl_hung_task_timeout_secs:
 */
static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;

static int __read_mostly sysctl_hung_task_warnings = 10;

static int __read_mostly did_panic;
static bool hung_task_show_lock;
static bool hung_task_call_panic;
static bool hung_task_show_all_bt;

static struct task_struct *watchdog_task;

#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs' backtraces in a hung task event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

/*
 * Should we panic (and reboot, if panic_timeout= is set) when a
 * hung task is detected:
 */
static unsigned int __read_mostly sysctl_hung_task_panic =
	IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);

static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
	did_panic = 1;

	return NOTIFY_DONE;
}

static struct notifier_block panic_block = {
	.notifier_call = hung_task_panic,
};

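/*
 * A task counts as hung when it sits in an unkillable uninterruptible
 * sleep and has not been context-switched, voluntarily or involuntarily,
 * for the whole timeout period.
 */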
static bool task_is_hung(struct task_struct *t, unsigned long timeout)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;
	unsigned int state = READ_ONCE(t->__state);

	/*
	 * skip the TASK_KILLABLE tasks -- these can be killed
	 * skip the TASK_IDLE tasks -- those are genuinely idle
	 * skip the TASK_FROZEN tasks -- they are legitimately stopped
	 * from scheduling by the freezer
	 */
	if (!(state & TASK_UNINTERRUPTIBLE) ||
	    (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
		return false;

	/*
	 * When a freshly created task is scheduled once and changes its
	 * state to TASK_UNINTERRUPTIBLE without ever having been switched
	 * out, it mustn't be checked.
	 */
	if (unlikely(!switch_count))
		return false;

	if (switch_count != t->last_switch_count) {
		t->last_switch_count = switch_count;
		t->last_switch_time = jiffies;
		return false;
	}
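
	/*
	 * The switch count is unchanged since the last scan: report the
	 * task only once it has been blocked for the full timeout.
	 */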
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return false;

	return true;
}

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
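/*
 * Best-effort report of the lock a hung task is blocked on, and of that
 * lock's owner. task->blocker is assumed here to be recorded by the
 * mutex/semaphore/rwsem sleeping slowpaths (see <linux/hung_task.h>);
 * since the lock can be released concurrently, the owner is reported as
 * "likely" rather than as a definitive fact.
 */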
static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
	struct task_struct *g, *t;
	unsigned long owner, blocker, blocker_type;
	const char *rwsem_blocked_by, *rwsem_blocked_as;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");

	blocker = READ_ONCE(task->blocker);
	if (!blocker)
		return;

	blocker_type = hung_task_get_blocker_type(blocker);

	switch (blocker_type) {
	case BLOCKER_TYPE_MUTEX:
		owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
		break;
	case BLOCKER_TYPE_SEM:
		owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
		break;
	case BLOCKER_TYPE_RWSEM_READER:
	case BLOCKER_TYPE_RWSEM_WRITER:
		owner = (unsigned long)rwsem_owner(
				hung_task_blocker_to_lock(blocker));
		rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
					"reader" : "writer";
		rwsem_blocked_by = is_rwsem_reader_owned(
					hung_task_blocker_to_lock(blocker)) ?
					"reader" : "writer";
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	if (unlikely(!owner)) {
		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
			       task->comm, task->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
			       task->comm, task->pid);
			break;
		case BLOCKER_TYPE_RWSEM_READER:
		case BLOCKER_TYPE_RWSEM_WRITER:
			pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
			       task->comm, task->pid);
			break;
		}
		return;
	}

	/* Ensure the owner information is correct. */
	for_each_process_thread(g, t) {
		if ((unsigned long)t != owner)
			continue;

		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		case BLOCKER_TYPE_RWSEM_READER:
		case BLOCKER_TYPE_RWSEM_WRITER:
			pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
			       task->comm, task->pid, rwsem_blocked_as, t->comm,
			       t->pid, rwsem_blocked_by);
			break;
		}
		/* Avoid a duplicated task dump: skip if the task is also hung. */
		if (!task_is_hung(t, timeout))
			sched_show_task(t);
		return;
	}
}
#else
static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
}
#endif

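/*
 * Report one hung task: bump the detect counter, emit the rate-limited
 * warning plus a stack dump, and arm the panic/all-CPU-backtrace flags
 * if the corresponding sysctls are set.
 */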
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	if (!task_is_hung(t, timeout))
		return;

	/*
	 * This counter tracks the total number of tasks detected as hung
	 * since boot.
	 */
	sysctl_hung_task_detect_count++;

	trace_sched_process_hang(t);

	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * Ok, the task did not get scheduled for more than 2 minutes
	 * (the default timeout), complain:
	 */
	if (sysctl_hung_task_warnings || hung_task_call_panic) {
		if (sysctl_hung_task_warnings > 0)
			sysctl_hung_task_warnings--;
		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
		       t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
		pr_err("      %s %s %.*s\n",
		       print_tainted(), init_utsname()->release,
		       (int)strcspn(init_utsname()->version, " "),
		       init_utsname()->version);
		if (t->flags & PF_POSTCOREDUMP)
			pr_err("      Blocked by coredump.\n");
		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
		       " disables this message.\n");
		sched_show_task(t);
		debug_show_blocker(t, timeout);
		hung_task_show_lock = true;

		if (sysctl_hung_task_all_cpu_backtrace)
			hung_task_show_all_bt = true;
		if (!sysctl_hung_task_warnings)
			pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
	}

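	/*
	 * Dumping stacks (and possibly all held locks) can take long
	 * enough to trigger the hardlockup detector, so reset it here.
	 */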
	touch_nmi_watchdog();
}

/*
 * To avoid extending the RCU grace period for an unbounded amount of time,
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
 * to exit the grace period. For classic RCU, a reschedule is required.
 */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
	bool can_cont;

	get_task_struct(g);
	get_task_struct(t);
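
	/*
	 * The references taken above keep g and t from being freed while
	 * the RCU read lock is dropped; pid_alive() then rechecks that
	 * both are still linked in the task list before the scan resumes
	 * from them.
	 */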
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
	can_cont = pid_alive(g) && pid_alive(t);
	put_task_struct(t);
	put_task_struct(g);

	return can_cont;
}

/*
 * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
 * a really long time (120 seconds by default). If that happens, print
 * out a warning.
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;
	struct task_struct *g, *t;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {

		if (!max_count--)
			goto unlock;
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}

		check_hung_task(t, timeout);
	}
unlock:
	rcu_read_unlock();
	if (hung_task_show_lock)
		debug_show_all_locks();

	if (hung_task_show_all_bt) {
		hung_task_show_all_bt = false;
		trigger_all_cpu_backtrace();
	}

	if (hung_task_call_panic)
		panic("hung_task: blocked tasks");
}

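/*
 * Returns the number of jiffies until the next check is due (zero or
 * negative when a check is already overdue), or MAX_SCHEDULE_TIMEOUT
 * when a timeout of 0 has disabled the detector.
 */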
static long hung_timeout_jiffies(unsigned long last_checked,
				 unsigned long timeout)
{
	/* timeout of 0 will disable the watchdog */
	return timeout ? last_checked - jiffies + timeout * HZ :
		MAX_SCHEDULE_TIMEOUT;
}

#ifdef CONFIG_SYSCTL
/*
 * Process updating of timeout sysctl
 */
static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
					 void *buffer,
					 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		goto out;

	wake_up_process(watchdog_task);

out:
	return ret;
}

/*
 * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
 * and hung_task_check_interval_secs
 */
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
	{
		.procname	= "hung_task_all_cpu_backtrace",
		.data		= &sysctl_hung_task_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
	{
		.procname	= "hung_task_panic",
		.data		= &sysctl_hung_task_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "hung_task_check_count",
		.data		= &sysctl_hung_task_check_count,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "hung_task_timeout_secs",
		.data		= &sysctl_hung_task_timeout_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_check_interval_secs",
		.data		= &sysctl_hung_task_check_interval_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_warnings",
		.data		= &sysctl_hung_task_warnings,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_NEG_ONE,
	},
	{
		.procname	= "hung_task_detect_count",
		.data		= &sysctl_hung_task_detect_count,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

static void __init hung_task_sysctl_init(void)
{
	register_sysctl_init("kernel", hung_task_sysctls);
}
#else
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static atomic_t reset_hung_task = ATOMIC_INIT(0);

void reset_hung_task_detector(void)
{
	atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);

static bool hung_detector_suspended;

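/*
 * Suspend and hibernation can legitimately leave tasks in D state for
 * long stretches, so the detector is paused across the PM transition
 * to avoid false positives.
 */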
static int hungtask_pm_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
	case PM_RESTORE_PREPARE:
		hung_detector_suspended = true;
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
	case PM_POST_RESTORE:
		hung_detector_suspended = false;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

/*
 * kthread which checks for tasks stuck in D state
 */
static int watchdog(void *dummy)
{
	unsigned long hung_last_checked = jiffies;

	set_user_nice(current, 0);

	for ( ; ; ) {
		unsigned long timeout = sysctl_hung_task_timeout_secs;
		unsigned long interval = sysctl_hung_task_check_interval_secs;
		long t;

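		/*
		 * An interval of 0 falls back to the timeout, and the
		 * interval is clamped so a hung task is never checked
		 * less often than once per timeout period.
		 */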
		if (interval == 0)
			interval = timeout;
		interval = min_t(unsigned long, interval, timeout);
		t = hung_timeout_jiffies(hung_last_checked, interval);
		if (t <= 0) {
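			/*
			 * Skip this round if a reset was requested (the
			 * xchg consumes the request) or if the detector
			 * is suspended for a PM transition.
			 */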
			if (!atomic_xchg(&reset_hung_task, 0) &&
			    !hung_detector_suspended)
				check_hung_uninterruptible_tasks(timeout);
			hung_last_checked = jiffies;
			continue;
		}
		schedule_timeout_interruptible(t);
	}

	return 0;
}

static int __init hung_task_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);

	/* Disable hung task detector on suspend */
	pm_notifier(hungtask_pm_notify, 0);

	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
	hung_task_sysctl_init();

	return 0;
}
subsys_initcall(hung_task_init);