1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Detect Hung Task 4 * 5 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state 6 * 7 */ 8 9 #include <linux/mm.h> 10 #include <linux/cpu.h> 11 #include <linux/nmi.h> 12 #include <linux/init.h> 13 #include <linux/delay.h> 14 #include <linux/freezer.h> 15 #include <linux/kthread.h> 16 #include <linux/lockdep.h> 17 #include <linux/export.h> 18 #include <linux/panic_notifier.h> 19 #include <linux/sysctl.h> 20 #include <linux/suspend.h> 21 #include <linux/utsname.h> 22 #include <linux/sched/signal.h> 23 #include <linux/sched/debug.h> 24 #include <linux/sched/sysctl.h> 25 #include <linux/hung_task.h> 26 #include <linux/rwsem.h> 27 #include <linux/sys_info.h> 28 29 #include <trace/events/sched.h> 30 31 /* 32 * The number of tasks checked: 33 */ 34 static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; 35 36 /* 37 * Total number of tasks detected as hung since boot: 38 */ 39 static atomic_long_t sysctl_hung_task_detect_count = ATOMIC_LONG_INIT(0); 40 41 /* 42 * Limit number of tasks checked in a batch. 43 * 44 * This value controls the preemptibility of khungtaskd since preemption 45 * is disabled during the critical section. It also controls the size of 46 * the RCU grace period. So it needs to be upper-bound. 47 */ 48 #define HUNG_TASK_LOCK_BREAK (HZ / 10) 49 50 /* 51 * Zero means infinite timeout - no checking done: 52 */ 53 unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; 54 55 /* 56 * Zero (default value) means use sysctl_hung_task_timeout_secs: 57 */ 58 static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; 59 60 static int __read_mostly sysctl_hung_task_warnings = 10; 61 62 static int __read_mostly did_panic; 63 static bool hung_task_call_panic; 64 65 static struct task_struct *watchdog_task; 66 67 /* 68 * A bitmask to control what kinds of system info to be printed when 69 * a hung task is detected, it could be task, memory, lock etc. Refer 70 * include/linux/sys_info.h for detailed bit definition. 71 */ 72 static unsigned long hung_task_si_mask; 73 74 #ifdef CONFIG_SMP 75 /* 76 * Should we dump all CPUs backtraces in a hung task event? 77 * Defaults to 0, can be changed via sysctl. 78 */ 79 static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; 80 #else 81 #define sysctl_hung_task_all_cpu_backtrace 0 82 #endif /* CONFIG_SMP */ 83 84 /* 85 * Should we panic (and reboot, if panic_timeout= is set) when a 86 * hung task is detected: 87 */ 88 static unsigned int __read_mostly sysctl_hung_task_panic = 89 CONFIG_BOOTPARAM_HUNG_TASK_PANIC; 90 91 static int 92 hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) 93 { 94 did_panic = 1; 95 96 return NOTIFY_DONE; 97 } 98 99 static struct notifier_block panic_block = { 100 .notifier_call = hung_task_panic, 101 }; 102 103 static bool task_is_hung(struct task_struct *t, unsigned long timeout) 104 { 105 unsigned long switch_count = t->nvcsw + t->nivcsw; 106 unsigned int state = READ_ONCE(t->__state); 107 108 /* 109 * skip the TASK_KILLABLE tasks -- these can be killed 110 * skip the TASK_IDLE tasks -- those are genuinely idle 111 * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer 112 */ 113 if (!(state & TASK_UNINTERRUPTIBLE) || 114 (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) 115 return false; 116 117 /* 118 * When a freshly created task is scheduled once, changes its state to 119 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it 120 * musn't be checked. 121 */ 122 if (unlikely(!switch_count)) 123 return false; 124 125 if (switch_count != t->last_switch_count) { 126 t->last_switch_count = switch_count; 127 t->last_switch_time = jiffies; 128 return false; 129 } 130 if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) 131 return false; 132 133 return true; 134 } 135 136 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER 137 static void debug_show_blocker(struct task_struct *task, unsigned long timeout) 138 { 139 struct task_struct *g, *t; 140 unsigned long owner, blocker, blocker_type; 141 const char *rwsem_blocked_by, *rwsem_blocked_as; 142 143 RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); 144 145 blocker = READ_ONCE(task->blocker); 146 if (!blocker) 147 return; 148 149 blocker_type = hung_task_get_blocker_type(blocker); 150 151 switch (blocker_type) { 152 case BLOCKER_TYPE_MUTEX: 153 owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); 154 break; 155 case BLOCKER_TYPE_SEM: 156 owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); 157 break; 158 case BLOCKER_TYPE_RWSEM_READER: 159 case BLOCKER_TYPE_RWSEM_WRITER: 160 owner = (unsigned long)rwsem_owner( 161 hung_task_blocker_to_lock(blocker)); 162 rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? 163 "reader" : "writer"; 164 rwsem_blocked_by = is_rwsem_reader_owned( 165 hung_task_blocker_to_lock(blocker)) ? 166 "reader" : "writer"; 167 break; 168 default: 169 WARN_ON_ONCE(1); 170 return; 171 } 172 173 174 if (unlikely(!owner)) { 175 switch (blocker_type) { 176 case BLOCKER_TYPE_MUTEX: 177 pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", 178 task->comm, task->pid); 179 break; 180 case BLOCKER_TYPE_SEM: 181 pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", 182 task->comm, task->pid); 183 break; 184 case BLOCKER_TYPE_RWSEM_READER: 185 case BLOCKER_TYPE_RWSEM_WRITER: 186 pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", 187 task->comm, task->pid); 188 break; 189 } 190 return; 191 } 192 193 /* Ensure the owner information is correct. */ 194 for_each_process_thread(g, t) { 195 if ((unsigned long)t != owner) 196 continue; 197 198 switch (blocker_type) { 199 case BLOCKER_TYPE_MUTEX: 200 pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", 201 task->comm, task->pid, t->comm, t->pid); 202 break; 203 case BLOCKER_TYPE_SEM: 204 pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", 205 task->comm, task->pid, t->comm, t->pid); 206 break; 207 case BLOCKER_TYPE_RWSEM_READER: 208 case BLOCKER_TYPE_RWSEM_WRITER: 209 pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", 210 task->comm, task->pid, rwsem_blocked_as, t->comm, 211 t->pid, rwsem_blocked_by); 212 break; 213 } 214 /* Avoid duplicated task dump, skip if the task is also hung. */ 215 if (!task_is_hung(t, timeout)) 216 sched_show_task(t); 217 return; 218 } 219 } 220 #else 221 static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) 222 { 223 } 224 #endif 225 226 /** 227 * hung_task_info - Print diagnostic details for a hung task 228 * @t: Pointer to the detected hung task. 229 * @timeout: Timeout threshold for detecting hung tasks 230 * @this_round_count: Count of hung tasks detected in the current iteration 231 * 232 * Print structured information about the specified hung task, if warnings 233 * are enabled or if the panic batch threshold is exceeded. 234 */ 235 static void hung_task_info(struct task_struct *t, unsigned long timeout, 236 unsigned long this_round_count) 237 { 238 trace_sched_process_hang(t); 239 240 if (sysctl_hung_task_panic && this_round_count >= sysctl_hung_task_panic) { 241 console_verbose(); 242 hung_task_call_panic = true; 243 } 244 245 /* 246 * The given task did not get scheduled for more than 247 * CONFIG_DEFAULT_HUNG_TASK_TIMEOUT. Therefore, complain 248 * accordingly 249 */ 250 if (sysctl_hung_task_warnings || hung_task_call_panic) { 251 if (sysctl_hung_task_warnings > 0) 252 sysctl_hung_task_warnings--; 253 pr_err("INFO: task %s:%d blocked%s for more than %ld seconds.\n", 254 t->comm, t->pid, t->in_iowait ? " in I/O wait" : "", 255 (jiffies - t->last_switch_time) / HZ); 256 pr_err(" %s %s %.*s\n", 257 print_tainted(), init_utsname()->release, 258 (int)strcspn(init_utsname()->version, " "), 259 init_utsname()->version); 260 if (t->flags & PF_POSTCOREDUMP) 261 pr_err(" Blocked by coredump.\n"); 262 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 263 " disables this message.\n"); 264 sched_show_task(t); 265 debug_show_blocker(t, timeout); 266 267 if (!sysctl_hung_task_warnings) 268 pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); 269 } 270 271 touch_nmi_watchdog(); 272 } 273 274 /* 275 * To avoid extending the RCU grace period for an unbounded amount of time, 276 * periodically exit the critical section and enter a new one. 277 * 278 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 279 * to exit the grace period. For classic RCU, a reschedule is required. 280 */ 281 static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) 282 { 283 bool can_cont; 284 285 get_task_struct(g); 286 get_task_struct(t); 287 rcu_read_unlock(); 288 cond_resched(); 289 rcu_read_lock(); 290 can_cont = pid_alive(g) && pid_alive(t); 291 put_task_struct(t); 292 put_task_struct(g); 293 294 return can_cont; 295 } 296 297 /* 298 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for 299 * a really long time. If that happens, print out a warning. 300 */ 301 static void check_hung_uninterruptible_tasks(unsigned long timeout) 302 { 303 int max_count = sysctl_hung_task_check_count; 304 unsigned long last_break = jiffies; 305 struct task_struct *g, *t; 306 unsigned long this_round_count; 307 int need_warning = sysctl_hung_task_warnings; 308 unsigned long si_mask = hung_task_si_mask; 309 310 /* 311 * If the system crashed already then all bets are off, 312 * do not report extra hung tasks: 313 */ 314 if (test_taint(TAINT_DIE) || did_panic) 315 return; 316 317 this_round_count = 0; 318 rcu_read_lock(); 319 for_each_process_thread(g, t) { 320 if (!max_count--) 321 goto unlock; 322 if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { 323 if (!rcu_lock_break(g, t)) 324 goto unlock; 325 last_break = jiffies; 326 } 327 328 if (task_is_hung(t, timeout)) { 329 /* 330 * Increment the global counter so that userspace could 331 * start migrating tasks ASAP. But count the current 332 * round separately because userspace could reset 333 * the global counter at any time. 334 */ 335 atomic_long_inc(&sysctl_hung_task_detect_count); 336 this_round_count++; 337 hung_task_info(t, timeout, this_round_count); 338 } 339 } 340 unlock: 341 rcu_read_unlock(); 342 343 if (!this_round_count) 344 return; 345 346 if (need_warning || hung_task_call_panic) { 347 si_mask |= SYS_INFO_LOCKS; 348 349 if (sysctl_hung_task_all_cpu_backtrace) 350 si_mask |= SYS_INFO_ALL_BT; 351 } 352 353 sys_info(si_mask); 354 355 if (hung_task_call_panic) 356 panic("hung_task: blocked tasks"); 357 } 358 359 static long hung_timeout_jiffies(unsigned long last_checked, 360 unsigned long timeout) 361 { 362 /* timeout of 0 will disable the watchdog */ 363 return timeout ? last_checked - jiffies + timeout * HZ : 364 MAX_SCHEDULE_TIMEOUT; 365 } 366 367 #ifdef CONFIG_SYSCTL 368 369 /** 370 * proc_dohung_task_detect_count - proc handler for hung_task_detect_count 371 * @table: Pointer to the struct ctl_table definition for this proc entry 372 * @dir: Flag indicating the operation 373 * @buffer: User space buffer for data transfer 374 * @lenp: Pointer to the length of the data being transferred 375 * @ppos: Pointer to the current file offset 376 * 377 * This handler is used for reading the current hung task detection count 378 * and for resetting it to zero when a write operation is performed using a 379 * zero value only. 380 * Return: 0 on success, or a negative error code on failure. 381 */ 382 static int proc_dohung_task_detect_count(const struct ctl_table *table, int dir, 383 void *buffer, size_t *lenp, loff_t *ppos) 384 { 385 unsigned long detect_count; 386 struct ctl_table proxy_table; 387 int err; 388 389 proxy_table = *table; 390 proxy_table.data = &detect_count; 391 392 if (SYSCTL_KERN_TO_USER(dir)) 393 detect_count = atomic_long_read(&sysctl_hung_task_detect_count); 394 395 err = proc_doulongvec_minmax(&proxy_table, dir, buffer, lenp, ppos); 396 if (err < 0) 397 return err; 398 399 if (SYSCTL_USER_TO_KERN(dir)) { 400 if (detect_count) 401 return -EINVAL; 402 atomic_long_set(&sysctl_hung_task_detect_count, 0); 403 } 404 405 return 0; 406 } 407 408 /* 409 * Process updating of timeout sysctl 410 */ 411 static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write, 412 void *buffer, 413 size_t *lenp, loff_t *ppos) 414 { 415 int ret; 416 417 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 418 419 if (ret || !write) 420 goto out; 421 422 wake_up_process(watchdog_task); 423 424 out: 425 return ret; 426 } 427 428 /* 429 * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs 430 * and hung_task_check_interval_secs 431 */ 432 static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); 433 static const struct ctl_table hung_task_sysctls[] = { 434 #ifdef CONFIG_SMP 435 { 436 .procname = "hung_task_all_cpu_backtrace", 437 .data = &sysctl_hung_task_all_cpu_backtrace, 438 .maxlen = sizeof(int), 439 .mode = 0644, 440 .proc_handler = proc_dointvec_minmax, 441 .extra1 = SYSCTL_ZERO, 442 .extra2 = SYSCTL_ONE, 443 }, 444 #endif /* CONFIG_SMP */ 445 { 446 .procname = "hung_task_panic", 447 .data = &sysctl_hung_task_panic, 448 .maxlen = sizeof(int), 449 .mode = 0644, 450 .proc_handler = proc_dointvec_minmax, 451 .extra1 = SYSCTL_ZERO, 452 .extra2 = SYSCTL_INT_MAX, 453 }, 454 { 455 .procname = "hung_task_check_count", 456 .data = &sysctl_hung_task_check_count, 457 .maxlen = sizeof(int), 458 .mode = 0644, 459 .proc_handler = proc_dointvec_minmax, 460 .extra1 = SYSCTL_ZERO, 461 }, 462 { 463 .procname = "hung_task_timeout_secs", 464 .data = &sysctl_hung_task_timeout_secs, 465 .maxlen = sizeof(unsigned long), 466 .mode = 0644, 467 .proc_handler = proc_dohung_task_timeout_secs, 468 .extra2 = (void *)&hung_task_timeout_max, 469 }, 470 { 471 .procname = "hung_task_check_interval_secs", 472 .data = &sysctl_hung_task_check_interval_secs, 473 .maxlen = sizeof(unsigned long), 474 .mode = 0644, 475 .proc_handler = proc_dohung_task_timeout_secs, 476 .extra2 = (void *)&hung_task_timeout_max, 477 }, 478 { 479 .procname = "hung_task_warnings", 480 .data = &sysctl_hung_task_warnings, 481 .maxlen = sizeof(int), 482 .mode = 0644, 483 .proc_handler = proc_dointvec_minmax, 484 .extra1 = SYSCTL_NEG_ONE, 485 }, 486 { 487 .procname = "hung_task_detect_count", 488 .maxlen = sizeof(unsigned long), 489 .mode = 0644, 490 .proc_handler = proc_dohung_task_detect_count, 491 }, 492 { 493 .procname = "hung_task_sys_info", 494 .data = &hung_task_si_mask, 495 .maxlen = sizeof(hung_task_si_mask), 496 .mode = 0644, 497 .proc_handler = sysctl_sys_info_handler, 498 }, 499 }; 500 501 static void __init hung_task_sysctl_init(void) 502 { 503 register_sysctl_init("kernel", hung_task_sysctls); 504 } 505 #else 506 #define hung_task_sysctl_init() do { } while (0) 507 #endif /* CONFIG_SYSCTL */ 508 509 510 static atomic_t reset_hung_task = ATOMIC_INIT(0); 511 512 void reset_hung_task_detector(void) 513 { 514 atomic_set(&reset_hung_task, 1); 515 } 516 EXPORT_SYMBOL_GPL(reset_hung_task_detector); 517 518 static bool hung_detector_suspended; 519 520 static int hungtask_pm_notify(struct notifier_block *self, 521 unsigned long action, void *hcpu) 522 { 523 switch (action) { 524 case PM_SUSPEND_PREPARE: 525 case PM_HIBERNATION_PREPARE: 526 case PM_RESTORE_PREPARE: 527 hung_detector_suspended = true; 528 break; 529 case PM_POST_SUSPEND: 530 case PM_POST_HIBERNATION: 531 case PM_POST_RESTORE: 532 hung_detector_suspended = false; 533 break; 534 default: 535 break; 536 } 537 return NOTIFY_OK; 538 } 539 540 /* 541 * kthread which checks for tasks stuck in D state 542 */ 543 static int watchdog(void *dummy) 544 { 545 unsigned long hung_last_checked = jiffies; 546 547 set_user_nice(current, 0); 548 549 for ( ; ; ) { 550 unsigned long timeout = sysctl_hung_task_timeout_secs; 551 unsigned long interval = sysctl_hung_task_check_interval_secs; 552 long t; 553 554 if (interval == 0) 555 interval = timeout; 556 interval = min_t(unsigned long, interval, timeout); 557 t = hung_timeout_jiffies(hung_last_checked, interval); 558 if (t <= 0) { 559 if (!atomic_xchg(&reset_hung_task, 0) && 560 !hung_detector_suspended) 561 check_hung_uninterruptible_tasks(timeout); 562 hung_last_checked = jiffies; 563 continue; 564 } 565 schedule_timeout_interruptible(t); 566 } 567 568 return 0; 569 } 570 571 static int __init hung_task_init(void) 572 { 573 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 574 575 /* Disable hung task detector on suspend */ 576 pm_notifier(hungtask_pm_notify, 0); 577 578 watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); 579 hung_task_sysctl_init(); 580 581 return 0; 582 } 583 subsys_initcall(hung_task_init); 584