// SPDX-License-Identifier: GPL-2.0-only
/*
 * Detect Hung Task
 *
 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
 *
 */

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
#include <linux/rwsem.h>

#include <trace/events/sched.h>

/*
 * The number of tasks checked:
 */
static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;

/*
 * Total number of tasks detected as hung since boot:
 */
static unsigned long __read_mostly sysctl_hung_task_detect_count;

/*
 * Limit the number of tasks checked in a batch.
 *
 * This value controls the preemptibility of khungtaskd since preemption
 * is disabled during the critical section. It also controls the size of
 * the RCU grace period, so it needs an upper bound.
 */
#define HUNG_TASK_LOCK_BREAK (HZ / 10)

/*
 * Zero means infinite timeout - no checking done:
 */
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);

/*
 * Zero (the default) means use sysctl_hung_task_timeout_secs:
 */
static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;

static int __read_mostly sysctl_hung_task_warnings = 10;

static int __read_mostly did_panic;
static bool hung_task_show_lock;
static bool hung_task_call_panic;
static bool hung_task_show_all_bt;

static struct task_struct *watchdog_task;
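/*
 * Illustrative only: the knobs above are exposed under /proc/sys/kernel
 * (see hung_task_sysctls below), so a typical tuning session looks like:
 *
 *	echo 60 > /proc/sys/kernel/hung_task_timeout_secs
 *	echo 1  > /proc/sys/kernel/hung_task_panic
 *
 * A timeout of zero disables checking entirely.
 */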
#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs' backtraces in a hung task event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

/*
 * Should we panic (and reboot, if panic_timeout= is set) when a
 * hung task is detected:
 */
static unsigned int __read_mostly sysctl_hung_task_panic =
	IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);

static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
	did_panic = 1;

	return NOTIFY_DONE;
}

static struct notifier_block panic_block = {
	.notifier_call = hung_task_panic,
};

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
static void debug_show_blocker(struct task_struct *task)
{
	struct task_struct *g, *t;
	unsigned long owner, blocker, blocker_type;
	const char *rwsem_blocked_by, *rwsem_blocked_as;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");

	blocker = READ_ONCE(task->blocker);
	if (!blocker)
		return;

	blocker_type = hung_task_get_blocker_type(blocker);

	switch (blocker_type) {
	case BLOCKER_TYPE_MUTEX:
		owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
		break;
	case BLOCKER_TYPE_SEM:
		owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
		break;
	case BLOCKER_TYPE_RWSEM_READER:
	case BLOCKER_TYPE_RWSEM_WRITER:
		owner = (unsigned long)rwsem_owner(
				hung_task_blocker_to_lock(blocker));
		rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
					"reader" : "writer";
		rwsem_blocked_by = is_rwsem_reader_owned(
					hung_task_blocker_to_lock(blocker)) ?
					"reader" : "writer";
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	if (unlikely(!owner)) {
		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
			       task->comm, task->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
			       task->comm, task->pid);
			break;
		case BLOCKER_TYPE_RWSEM_READER:
		case BLOCKER_TYPE_RWSEM_WRITER:
			pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
			       task->comm, task->pid);
			break;
		}
		return;
	}

	/* Make sure the recorded owner still exists before reporting it. */
	for_each_process_thread(g, t) {
		if ((unsigned long)t != owner)
			continue;

		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		case BLOCKER_TYPE_RWSEM_READER:
		case BLOCKER_TYPE_RWSEM_WRITER:
			pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
			       task->comm, task->pid, rwsem_blocked_as, t->comm,
			       t->pid, rwsem_blocked_by);
			break;
		}
		sched_show_task(t);
		return;
	}
}
#else
static inline void debug_show_blocker(struct task_struct *task)
{
}
#endif
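/*
 * Check a single task for being stuck in D state for longer than @timeout
 * seconds and, if so, report it. An illustrative report (task name, PID
 * and kernel version are made up) looks like:
 *
 *	INFO: task cat:1234 blocked for more than 120 seconds.
 *	      Not tainted 6.16.0 #1
 *	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 */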
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;

	/*
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that the freezer
	 * should skip.
	 */
	if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
		return;

	/*
	 * When a freshly created task is scheduled once, it changes its
	 * state to TASK_UNINTERRUPTIBLE without having ever been switched
	 * out, so it must not be checked.
	 */
	if (unlikely(!switch_count))
		return;

	if (switch_count != t->last_switch_count) {
		t->last_switch_count = switch_count;
		t->last_switch_time = jiffies;
		return;
	}
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return;

	/*
	 * This counter tracks the total number of tasks detected as hung
	 * since boot.
	 */
	sysctl_hung_task_detect_count++;

	trace_sched_process_hang(t);

	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * Ok, the task did not get scheduled for more than 2 minutes,
	 * complain:
	 */
	if (sysctl_hung_task_warnings || hung_task_call_panic) {
		if (sysctl_hung_task_warnings > 0)
			sysctl_hung_task_warnings--;
		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
		       t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
		pr_err("      %s %s %.*s\n",
		       print_tainted(), init_utsname()->release,
		       (int)strcspn(init_utsname()->version, " "),
		       init_utsname()->version);
		if (t->flags & PF_POSTCOREDUMP)
			pr_err("      Blocked by coredump.\n");
		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
		       " disables this message.\n");
		sched_show_task(t);
		debug_show_blocker(t);
		hung_task_show_lock = true;

		if (sysctl_hung_task_all_cpu_backtrace)
			hung_task_show_all_bt = true;
		if (!sysctl_hung_task_warnings)
			pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
	}

	touch_nmi_watchdog();
}

/*
 * To avoid extending the RCU grace period for an unbounded amount of time,
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
 * to exit the grace period. For classic RCU, a reschedule is required.
 */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
	bool can_cont;

	get_task_struct(g);
	get_task_struct(t);
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
	can_cont = pid_alive(g) && pid_alive(t);
	put_task_struct(t);
	put_task_struct(g);

	return can_cont;
}
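/*
 * Note for callers of rcu_lock_break(): a false return means @g or @t
 * exited while the RCU read-side lock was dropped, so the current walk
 * of the task list is no longer safe and must be abandoned (as the
 * caller below does via its unlock label).
 */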
/*
 * Check whether any TASK_UNINTERRUPTIBLE task has failed to get woken up
 * for a really long time (120 seconds by default). If that happens, print
 * out a warning.
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;
	struct task_struct *g, *t;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {
		unsigned int state;

		if (!max_count--)
			goto unlock;
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}
		/*
		 * Skip the TASK_KILLABLE tasks -- these can be killed.
		 * Skip the TASK_IDLE tasks -- those are genuinely idle.
		 */
		state = READ_ONCE(t->__state);
		if ((state & TASK_UNINTERRUPTIBLE) &&
		    !(state & TASK_WAKEKILL) &&
		    !(state & TASK_NOLOAD))
			check_hung_task(t, timeout);
	}
 unlock:
	rcu_read_unlock();
	if (hung_task_show_lock)
		debug_show_all_locks();

	if (hung_task_show_all_bt) {
		hung_task_show_all_bt = false;
		trigger_all_cpu_backtrace();
	}

	if (hung_task_call_panic)
		panic("hung_task: blocked tasks");
}

static long hung_timeout_jiffies(unsigned long last_checked,
				 unsigned long timeout)
{
	/* timeout of 0 will disable the watchdog */
	return timeout ? last_checked - jiffies + timeout * HZ :
		MAX_SCHEDULE_TIMEOUT;
}

#ifdef CONFIG_SYSCTL
/*
 * Process an update of the timeout sysctl and kick the watchdog so the
 * new value takes effect immediately.
 */
static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
					 void *buffer,
					 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		goto out;

	wake_up_process(watchdog_task);

out:
	return ret;
}

/*
 * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
 * and sysctl_hung_task_check_interval_secs.
 */
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
	{
		.procname	= "hung_task_all_cpu_backtrace",
		.data		= &sysctl_hung_task_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
	{
		.procname	= "hung_task_panic",
		.data		= &sysctl_hung_task_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "hung_task_check_count",
		.data		= &sysctl_hung_task_check_count,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "hung_task_timeout_secs",
		.data		= &sysctl_hung_task_timeout_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_check_interval_secs",
		.data		= &sysctl_hung_task_check_interval_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_warnings",
		.data		= &sysctl_hung_task_warnings,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_NEG_ONE,
	},
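	/*
	 * Read-only counter (mode 0444): total number of tasks detected
	 * as hung since boot, useful for scripts that poll for new
	 * detections without parsing dmesg.
	 */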
	{
		.procname	= "hung_task_detect_count",
		.data		= &sysctl_hung_task_detect_count,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
};

static void __init hung_task_sysctl_init(void)
{
	register_sysctl_init("kernel", hung_task_sysctls);
}
#else
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static atomic_t reset_hung_task = ATOMIC_INIT(0);

void reset_hung_task_detector(void)
{
	atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);

static bool hung_detector_suspended;

static int hungtask_pm_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
	case PM_RESTORE_PREPARE:
		hung_detector_suspended = true;
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
	case PM_POST_RESTORE:
		hung_detector_suspended = false;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

/*
 * kthread which checks for tasks stuck in D state
 */
static int watchdog(void *dummy)
{
	unsigned long hung_last_checked = jiffies;

	set_user_nice(current, 0);

	for ( ; ; ) {
		unsigned long timeout = sysctl_hung_task_timeout_secs;
		unsigned long interval = sysctl_hung_task_check_interval_secs;
		long t;

		if (interval == 0)
			interval = timeout;
		interval = min_t(unsigned long, interval, timeout);
		t = hung_timeout_jiffies(hung_last_checked, interval);
		if (t <= 0) {
			if (!atomic_xchg(&reset_hung_task, 0) &&
			    !hung_detector_suspended)
				check_hung_uninterruptible_tasks(timeout);
			hung_last_checked = jiffies;
			continue;
		}
		schedule_timeout_interruptible(t);
	}

	return 0;
}

static int __init hung_task_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);

	/* Disable hung task detector on suspend */
	pm_notifier(hungtask_pm_notify, 0);

	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
	hung_task_sysctl_init();

	return 0;
}
subsys_initcall(hung_task_init);
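/*
 * Worked example of the watchdog loop above (values illustrative): with
 * hung_task_timeout_secs=120 and hung_task_check_interval_secs=0, the
 * thread wakes every 120s; with the interval set to 10, it wakes every
 * 10s but still only reports tasks blocked for at least 120s.
 */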