1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/oom_kill.c 4 * 5 * Copyright (C) 1998,2000 Rik van Riel 6 * Thanks go out to Claus Fischer for some serious inspiration and 7 * for goading me into coding this file... 8 * Copyright (C) 2010 Google, Inc. 9 * Rewritten by David Rientjes 10 * 11 * The routines in this file are used to kill a process when 12 * we're seriously out of memory. This gets called from __alloc_pages() 13 * in mm/page_alloc.c when we really run out of memory. 14 * 15 * Since we won't call these routines often (on a well-configured 16 * machine) this file will double as a 'coding guide' and a signpost 17 * for newbie kernel hackers. It features several pointers to major 18 * kernel subsystems and hints as to where to find out what things do. 19 */ 20 21 #include <linux/oom.h> 22 #include <linux/mm.h> 23 #include <linux/err.h> 24 #include <linux/gfp.h> 25 #include <linux/sched.h> 26 #include <linux/sched/mm.h> 27 #include <linux/sched/coredump.h> 28 #include <linux/sched/task.h> 29 #include <linux/sched/debug.h> 30 #include <linux/swap.h> 31 #include <linux/syscalls.h> 32 #include <linux/timex.h> 33 #include <linux/jiffies.h> 34 #include <linux/cpuset.h> 35 #include <linux/export.h> 36 #include <linux/notifier.h> 37 #include <linux/memcontrol.h> 38 #include <linux/mempolicy.h> 39 #include <linux/security.h> 40 #include <linux/ptrace.h> 41 #include <linux/freezer.h> 42 #include <linux/ftrace.h> 43 #include <linux/ratelimit.h> 44 #include <linux/kthread.h> 45 #include <linux/init.h> 46 #include <linux/mmu_notifier.h> 47 48 #include <asm/tlb.h> 49 #include "internal.h" 50 #include "slab.h" 51 52 #define CREATE_TRACE_POINTS 53 #include <trace/events/oom.h> 54 55 int sysctl_panic_on_oom; 56 int sysctl_oom_kill_allocating_task; 57 int sysctl_oom_dump_tasks = 1; 58 59 /* 60 * Serializes oom killer invocations (out_of_memory()) from all contexts to 61 * prevent from over eager oom killing (e.g. when the oom killer is invoked 62 * from different domains). 63 * 64 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled 65 * and mark_oom_victim 66 */ 67 DEFINE_MUTEX(oom_lock); 68 /* Serializes oom_score_adj and oom_score_adj_min updates */ 69 DEFINE_MUTEX(oom_adj_mutex); 70 71 static inline bool is_memcg_oom(struct oom_control *oc) 72 { 73 return oc->memcg != NULL; 74 } 75 76 #ifdef CONFIG_NUMA 77 /** 78 * oom_cpuset_eligible() - check task eligibility for kill 79 * @start: task struct of which task to consider 80 * @oc: pointer to struct oom_control 81 * 82 * Task eligibility is determined by whether or not a candidate task, @tsk, 83 * shares the same mempolicy nodes as current if it is bound by such a policy 84 * and whether or not it has the same set of allowed cpuset nodes. 85 * 86 * This function is assuming oom-killer context and 'current' has triggered 87 * the oom-killer. 88 */ 89 static bool oom_cpuset_eligible(struct task_struct *start, 90 struct oom_control *oc) 91 { 92 struct task_struct *tsk; 93 bool ret = false; 94 const nodemask_t *mask = oc->nodemask; 95 96 if (is_memcg_oom(oc)) 97 return true; 98 99 rcu_read_lock(); 100 for_each_thread(start, tsk) { 101 if (mask) { 102 /* 103 * If this is a mempolicy constrained oom, tsk's 104 * cpuset is irrelevant. Only return true if its 105 * mempolicy intersects current, otherwise it may be 106 * needlessly killed. 107 */ 108 ret = mempolicy_in_oom_domain(tsk, mask); 109 } else { 110 /* 111 * This is not a mempolicy constrained oom, so only 112 * check the mems of tsk's cpuset. 113 */ 114 ret = cpuset_mems_allowed_intersects(current, tsk); 115 } 116 if (ret) 117 break; 118 } 119 rcu_read_unlock(); 120 121 return ret; 122 } 123 #else 124 static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) 125 { 126 return true; 127 } 128 #endif /* CONFIG_NUMA */ 129 130 /* 131 * The process p may have detached its own ->mm while exiting or through 132 * kthread_use_mm(), but one or more of its subthreads may still have a valid 133 * pointer. Return p, or any of its subthreads with a valid ->mm, with 134 * task_lock() held. 135 */ 136 struct task_struct *find_lock_task_mm(struct task_struct *p) 137 { 138 struct task_struct *t; 139 140 rcu_read_lock(); 141 142 for_each_thread(p, t) { 143 task_lock(t); 144 if (likely(t->mm)) 145 goto found; 146 task_unlock(t); 147 } 148 t = NULL; 149 found: 150 rcu_read_unlock(); 151 152 return t; 153 } 154 155 /* 156 * order == -1 means the oom kill is required by sysrq, otherwise only 157 * for display purposes. 158 */ 159 static inline bool is_sysrq_oom(struct oom_control *oc) 160 { 161 return oc->order == -1; 162 } 163 164 /* return true if the task is not adequate as candidate victim task. */ 165 static bool oom_unkillable_task(struct task_struct *p) 166 { 167 if (is_global_init(p)) 168 return true; 169 if (p->flags & PF_KTHREAD) 170 return true; 171 return false; 172 } 173 174 /* 175 * Check whether unreclaimable slab amount is greater than 176 * all user memory(LRU pages). 177 * dump_unreclaimable_slab() could help in the case that 178 * oom due to too much unreclaimable slab used by kernel. 179 */ 180 static bool should_dump_unreclaim_slab(void) 181 { 182 unsigned long nr_lru; 183 184 nr_lru = global_node_page_state(NR_ACTIVE_ANON) + 185 global_node_page_state(NR_INACTIVE_ANON) + 186 global_node_page_state(NR_ACTIVE_FILE) + 187 global_node_page_state(NR_INACTIVE_FILE) + 188 global_node_page_state(NR_ISOLATED_ANON) + 189 global_node_page_state(NR_ISOLATED_FILE) + 190 global_node_page_state(NR_UNEVICTABLE); 191 192 return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); 193 } 194 195 /** 196 * oom_badness - heuristic function to determine which candidate task to kill 197 * @p: task struct of which task we should calculate 198 * @totalpages: total present RAM allowed for page allocation 199 * 200 * The heuristic for determining which task to kill is made to be as simple and 201 * predictable as possible. The goal is to return the highest value for the 202 * task consuming the most memory to avoid subsequent oom failures. 203 */ 204 long oom_badness(struct task_struct *p, unsigned long totalpages) 205 { 206 long points; 207 long adj; 208 209 if (oom_unkillable_task(p)) 210 return LONG_MIN; 211 212 p = find_lock_task_mm(p); 213 if (!p) 214 return LONG_MIN; 215 216 /* 217 * Do not even consider tasks which are explicitly marked oom 218 * unkillable or have been already oom reaped or the are in 219 * the middle of vfork 220 */ 221 adj = (long)p->signal->oom_score_adj; 222 if (adj == OOM_SCORE_ADJ_MIN || 223 test_bit(MMF_OOM_SKIP, &p->mm->flags) || 224 in_vfork(p)) { 225 task_unlock(p); 226 return LONG_MIN; 227 } 228 229 /* 230 * The baseline for the badness score is the proportion of RAM that each 231 * task's rss, pagetable and swap space use. 232 */ 233 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + 234 mm_pgtables_bytes(p->mm) / PAGE_SIZE; 235 task_unlock(p); 236 237 /* Normalize to oom_score_adj units */ 238 adj *= totalpages / 1000; 239 points += adj; 240 241 return points; 242 } 243 244 static const char * const oom_constraint_text[] = { 245 [CONSTRAINT_NONE] = "CONSTRAINT_NONE", 246 [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", 247 [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", 248 [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", 249 }; 250 251 /* 252 * Determine the type of allocation constraint. 253 */ 254 static enum oom_constraint constrained_alloc(struct oom_control *oc) 255 { 256 struct zone *zone; 257 struct zoneref *z; 258 enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); 259 bool cpuset_limited = false; 260 int nid; 261 262 if (is_memcg_oom(oc)) { 263 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; 264 return CONSTRAINT_MEMCG; 265 } 266 267 /* Default to all available memory */ 268 oc->totalpages = totalram_pages() + total_swap_pages; 269 270 if (!IS_ENABLED(CONFIG_NUMA)) 271 return CONSTRAINT_NONE; 272 273 if (!oc->zonelist) 274 return CONSTRAINT_NONE; 275 /* 276 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 277 * to kill current.We have to random task kill in this case. 278 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. 279 */ 280 if (oc->gfp_mask & __GFP_THISNODE) 281 return CONSTRAINT_NONE; 282 283 /* 284 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in 285 * the page allocator means a mempolicy is in effect. Cpuset policy 286 * is enforced in get_page_from_freelist(). 287 */ 288 if (oc->nodemask && 289 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { 290 oc->totalpages = total_swap_pages; 291 for_each_node_mask(nid, *oc->nodemask) 292 oc->totalpages += node_present_pages(nid); 293 return CONSTRAINT_MEMORY_POLICY; 294 } 295 296 /* Check this allocation failure is caused by cpuset's wall function */ 297 for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, 298 highest_zoneidx, oc->nodemask) 299 if (!cpuset_zone_allowed(zone, oc->gfp_mask)) 300 cpuset_limited = true; 301 302 if (cpuset_limited) { 303 oc->totalpages = total_swap_pages; 304 for_each_node_mask(nid, cpuset_current_mems_allowed) 305 oc->totalpages += node_present_pages(nid); 306 return CONSTRAINT_CPUSET; 307 } 308 return CONSTRAINT_NONE; 309 } 310 311 static int oom_evaluate_task(struct task_struct *task, void *arg) 312 { 313 struct oom_control *oc = arg; 314 long points; 315 316 if (oom_unkillable_task(task)) 317 goto next; 318 319 /* p may not have freeable memory in nodemask */ 320 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) 321 goto next; 322 323 /* 324 * This task already has access to memory reserves and is being killed. 325 * Don't allow any other task to have access to the reserves unless 326 * the task has MMF_OOM_SKIP because chances that it would release 327 * any memory is quite low. 328 */ 329 if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { 330 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) 331 goto next; 332 goto abort; 333 } 334 335 /* 336 * If task is allocating a lot of memory and has been marked to be 337 * killed first if it triggers an oom, then select it. 338 */ 339 if (oom_task_origin(task)) { 340 points = LONG_MAX; 341 goto select; 342 } 343 344 points = oom_badness(task, oc->totalpages); 345 if (points == LONG_MIN || points < oc->chosen_points) 346 goto next; 347 348 select: 349 if (oc->chosen) 350 put_task_struct(oc->chosen); 351 get_task_struct(task); 352 oc->chosen = task; 353 oc->chosen_points = points; 354 next: 355 return 0; 356 abort: 357 if (oc->chosen) 358 put_task_struct(oc->chosen); 359 oc->chosen = (void *)-1UL; 360 return 1; 361 } 362 363 /* 364 * Simple selection loop. We choose the process with the highest number of 365 * 'points'. In case scan was aborted, oc->chosen is set to -1. 366 */ 367 static void select_bad_process(struct oom_control *oc) 368 { 369 oc->chosen_points = LONG_MIN; 370 371 if (is_memcg_oom(oc)) 372 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); 373 else { 374 struct task_struct *p; 375 376 rcu_read_lock(); 377 for_each_process(p) 378 if (oom_evaluate_task(p, oc)) 379 break; 380 rcu_read_unlock(); 381 } 382 } 383 384 static int dump_task(struct task_struct *p, void *arg) 385 { 386 struct oom_control *oc = arg; 387 struct task_struct *task; 388 389 if (oom_unkillable_task(p)) 390 return 0; 391 392 /* p may not have freeable memory in nodemask */ 393 if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) 394 return 0; 395 396 task = find_lock_task_mm(p); 397 if (!task) { 398 /* 399 * All of p's threads have already detached their mm's. There's 400 * no need to report them; they can't be oom killed anyway. 401 */ 402 return 0; 403 } 404 405 pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", 406 task->pid, from_kuid(&init_user_ns, task_uid(task)), 407 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 408 mm_pgtables_bytes(task->mm), 409 get_mm_counter(task->mm, MM_SWAPENTS), 410 task->signal->oom_score_adj, task->comm); 411 task_unlock(task); 412 413 return 0; 414 } 415 416 /** 417 * dump_tasks - dump current memory state of all system tasks 418 * @oc: pointer to struct oom_control 419 * 420 * Dumps the current memory state of all eligible tasks. Tasks not in the same 421 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 422 * are not shown. 423 * State information includes task's pid, uid, tgid, vm size, rss, 424 * pgtables_bytes, swapents, oom_score_adj value, and name. 425 */ 426 static void dump_tasks(struct oom_control *oc) 427 { 428 pr_info("Tasks state (memory values in pages):\n"); 429 pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); 430 431 if (is_memcg_oom(oc)) 432 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); 433 else { 434 struct task_struct *p; 435 436 rcu_read_lock(); 437 for_each_process(p) 438 dump_task(p, oc); 439 rcu_read_unlock(); 440 } 441 } 442 443 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) 444 { 445 /* one line summary of the oom killer context. */ 446 pr_info("oom-kill:constraint=%s,nodemask=%*pbl", 447 oom_constraint_text[oc->constraint], 448 nodemask_pr_args(oc->nodemask)); 449 cpuset_print_current_mems_allowed(); 450 mem_cgroup_print_oom_context(oc->memcg, victim); 451 pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, 452 from_kuid(&init_user_ns, task_uid(victim))); 453 } 454 455 static void dump_header(struct oom_control *oc, struct task_struct *p) 456 { 457 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", 458 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 459 current->signal->oom_score_adj); 460 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 461 pr_warn("COMPACTION is disabled!!!\n"); 462 463 dump_stack(); 464 if (is_memcg_oom(oc)) 465 mem_cgroup_print_oom_meminfo(oc->memcg); 466 else { 467 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); 468 if (should_dump_unreclaim_slab()) 469 dump_unreclaimable_slab(); 470 } 471 if (sysctl_oom_dump_tasks) 472 dump_tasks(oc); 473 if (p) 474 dump_oom_summary(oc, p); 475 } 476 477 /* 478 * Number of OOM victims in flight 479 */ 480 static atomic_t oom_victims = ATOMIC_INIT(0); 481 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); 482 483 static bool oom_killer_disabled __read_mostly; 484 485 #define K(x) ((x) << (PAGE_SHIFT-10)) 486 487 /* 488 * task->mm can be NULL if the task is the exited group leader. So to 489 * determine whether the task is using a particular mm, we examine all the 490 * task's threads: if one of those is using this mm then this task was also 491 * using it. 492 */ 493 bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) 494 { 495 struct task_struct *t; 496 497 for_each_thread(p, t) { 498 struct mm_struct *t_mm = READ_ONCE(t->mm); 499 if (t_mm) 500 return t_mm == mm; 501 } 502 return false; 503 } 504 505 #ifdef CONFIG_MMU 506 /* 507 * OOM Reaper kernel thread which tries to reap the memory used by the OOM 508 * victim (if that is possible) to help the OOM killer to move on. 509 */ 510 static struct task_struct *oom_reaper_th; 511 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); 512 static struct task_struct *oom_reaper_list; 513 static DEFINE_SPINLOCK(oom_reaper_lock); 514 515 bool __oom_reap_task_mm(struct mm_struct *mm) 516 { 517 struct vm_area_struct *vma; 518 bool ret = true; 519 520 /* 521 * Tell all users of get_user/copy_from_user etc... that the content 522 * is no longer stable. No barriers really needed because unmapping 523 * should imply barriers already and the reader would hit a page fault 524 * if it stumbled over a reaped memory. 525 */ 526 set_bit(MMF_UNSTABLE, &mm->flags); 527 528 for (vma = mm->mmap ; vma; vma = vma->vm_next) { 529 if (!can_madv_lru_vma(vma)) 530 continue; 531 532 /* 533 * Only anonymous pages have a good chance to be dropped 534 * without additional steps which we cannot afford as we 535 * are OOM already. 536 * 537 * We do not even care about fs backed pages because all 538 * which are reclaimable have already been reclaimed and 539 * we do not want to block exit_mmap by keeping mm ref 540 * count elevated without a good reason. 541 */ 542 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { 543 struct mmu_notifier_range range; 544 struct mmu_gather tlb; 545 546 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, 547 vma, mm, vma->vm_start, 548 vma->vm_end); 549 tlb_gather_mmu(&tlb, mm); 550 if (mmu_notifier_invalidate_range_start_nonblock(&range)) { 551 tlb_finish_mmu(&tlb); 552 ret = false; 553 continue; 554 } 555 unmap_page_range(&tlb, vma, range.start, range.end, NULL); 556 mmu_notifier_invalidate_range_end(&range); 557 tlb_finish_mmu(&tlb); 558 } 559 } 560 561 return ret; 562 } 563 564 /* 565 * Reaps the address space of the give task. 566 * 567 * Returns true on success and false if none or part of the address space 568 * has been reclaimed and the caller should retry later. 569 */ 570 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) 571 { 572 bool ret = true; 573 574 if (!mmap_read_trylock(mm)) { 575 trace_skip_task_reaping(tsk->pid); 576 return false; 577 } 578 579 /* 580 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't 581 * work on the mm anymore. The check for MMF_OOM_SKIP must run 582 * under mmap_lock for reading because it serializes against the 583 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). 584 */ 585 if (test_bit(MMF_OOM_SKIP, &mm->flags)) { 586 trace_skip_task_reaping(tsk->pid); 587 goto out_unlock; 588 } 589 590 trace_start_task_reaping(tsk->pid); 591 592 /* failed to reap part of the address space. Try again later */ 593 ret = __oom_reap_task_mm(mm); 594 if (!ret) 595 goto out_finish; 596 597 pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 598 task_pid_nr(tsk), tsk->comm, 599 K(get_mm_counter(mm, MM_ANONPAGES)), 600 K(get_mm_counter(mm, MM_FILEPAGES)), 601 K(get_mm_counter(mm, MM_SHMEMPAGES))); 602 out_finish: 603 trace_finish_task_reaping(tsk->pid); 604 out_unlock: 605 mmap_read_unlock(mm); 606 607 return ret; 608 } 609 610 #define MAX_OOM_REAP_RETRIES 10 611 static void oom_reap_task(struct task_struct *tsk) 612 { 613 int attempts = 0; 614 struct mm_struct *mm = tsk->signal->oom_mm; 615 616 /* Retry the mmap_read_trylock(mm) a few times */ 617 while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) 618 schedule_timeout_idle(HZ/10); 619 620 if (attempts <= MAX_OOM_REAP_RETRIES || 621 test_bit(MMF_OOM_SKIP, &mm->flags)) 622 goto done; 623 624 pr_info("oom_reaper: unable to reap pid:%d (%s)\n", 625 task_pid_nr(tsk), tsk->comm); 626 sched_show_task(tsk); 627 debug_show_all_locks(); 628 629 done: 630 tsk->oom_reaper_list = NULL; 631 632 /* 633 * Hide this mm from OOM killer because it has been either reaped or 634 * somebody can't call mmap_write_unlock(mm). 635 */ 636 set_bit(MMF_OOM_SKIP, &mm->flags); 637 638 /* Drop a reference taken by wake_oom_reaper */ 639 put_task_struct(tsk); 640 } 641 642 static int oom_reaper(void *unused) 643 { 644 set_freezable(); 645 646 while (true) { 647 struct task_struct *tsk = NULL; 648 649 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); 650 spin_lock(&oom_reaper_lock); 651 if (oom_reaper_list != NULL) { 652 tsk = oom_reaper_list; 653 oom_reaper_list = tsk->oom_reaper_list; 654 } 655 spin_unlock(&oom_reaper_lock); 656 657 if (tsk) 658 oom_reap_task(tsk); 659 } 660 661 return 0; 662 } 663 664 static void wake_oom_reaper(struct task_struct *tsk) 665 { 666 /* mm is already queued? */ 667 if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) 668 return; 669 670 get_task_struct(tsk); 671 672 spin_lock(&oom_reaper_lock); 673 tsk->oom_reaper_list = oom_reaper_list; 674 oom_reaper_list = tsk; 675 spin_unlock(&oom_reaper_lock); 676 trace_wake_reaper(tsk->pid); 677 wake_up(&oom_reaper_wait); 678 } 679 680 static int __init oom_init(void) 681 { 682 oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); 683 return 0; 684 } 685 subsys_initcall(oom_init) 686 #else 687 static inline void wake_oom_reaper(struct task_struct *tsk) 688 { 689 } 690 #endif /* CONFIG_MMU */ 691 692 /** 693 * mark_oom_victim - mark the given task as OOM victim 694 * @tsk: task to mark 695 * 696 * Has to be called with oom_lock held and never after 697 * oom has been disabled already. 698 * 699 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either 700 * under task_lock or operate on the current). 701 */ 702 static void mark_oom_victim(struct task_struct *tsk) 703 { 704 struct mm_struct *mm = tsk->mm; 705 706 WARN_ON(oom_killer_disabled); 707 /* OOM killer might race with memcg OOM */ 708 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) 709 return; 710 711 /* oom_mm is bound to the signal struct life time. */ 712 if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { 713 mmgrab(tsk->signal->oom_mm); 714 set_bit(MMF_OOM_VICTIM, &mm->flags); 715 } 716 717 /* 718 * Make sure that the task is woken up from uninterruptible sleep 719 * if it is frozen because OOM killer wouldn't be able to free 720 * any memory and livelock. freezing_slow_path will tell the freezer 721 * that TIF_MEMDIE tasks should be ignored. 722 */ 723 __thaw_task(tsk); 724 atomic_inc(&oom_victims); 725 trace_mark_victim(tsk->pid); 726 } 727 728 /** 729 * exit_oom_victim - note the exit of an OOM victim 730 */ 731 void exit_oom_victim(void) 732 { 733 clear_thread_flag(TIF_MEMDIE); 734 735 if (!atomic_dec_return(&oom_victims)) 736 wake_up_all(&oom_victims_wait); 737 } 738 739 /** 740 * oom_killer_enable - enable OOM killer 741 */ 742 void oom_killer_enable(void) 743 { 744 oom_killer_disabled = false; 745 pr_info("OOM killer enabled.\n"); 746 } 747 748 /** 749 * oom_killer_disable - disable OOM killer 750 * @timeout: maximum timeout to wait for oom victims in jiffies 751 * 752 * Forces all page allocations to fail rather than trigger OOM killer. 753 * Will block and wait until all OOM victims are killed or the given 754 * timeout expires. 755 * 756 * The function cannot be called when there are runnable user tasks because 757 * the userspace would see unexpected allocation failures as a result. Any 758 * new usage of this function should be consulted with MM people. 759 * 760 * Returns true if successful and false if the OOM killer cannot be 761 * disabled. 762 */ 763 bool oom_killer_disable(signed long timeout) 764 { 765 signed long ret; 766 767 /* 768 * Make sure to not race with an ongoing OOM killer. Check that the 769 * current is not killed (possibly due to sharing the victim's memory). 770 */ 771 if (mutex_lock_killable(&oom_lock)) 772 return false; 773 oom_killer_disabled = true; 774 mutex_unlock(&oom_lock); 775 776 ret = wait_event_interruptible_timeout(oom_victims_wait, 777 !atomic_read(&oom_victims), timeout); 778 if (ret <= 0) { 779 oom_killer_enable(); 780 return false; 781 } 782 pr_info("OOM killer disabled.\n"); 783 784 return true; 785 } 786 787 static inline bool __task_will_free_mem(struct task_struct *task) 788 { 789 struct signal_struct *sig = task->signal; 790 791 /* 792 * A coredumping process may sleep for an extended period in 793 * coredump_task_exit(), so the oom killer cannot assume that 794 * the process will promptly exit and release memory. 795 */ 796 if (sig->flags & SIGNAL_GROUP_COREDUMP) 797 return false; 798 799 if (sig->flags & SIGNAL_GROUP_EXIT) 800 return true; 801 802 if (thread_group_empty(task) && (task->flags & PF_EXITING)) 803 return true; 804 805 return false; 806 } 807 808 /* 809 * Checks whether the given task is dying or exiting and likely to 810 * release its address space. This means that all threads and processes 811 * sharing the same mm have to be killed or exiting. 812 * Caller has to make sure that task->mm is stable (hold task_lock or 813 * it operates on the current). 814 */ 815 static bool task_will_free_mem(struct task_struct *task) 816 { 817 struct mm_struct *mm = task->mm; 818 struct task_struct *p; 819 bool ret = true; 820 821 /* 822 * Skip tasks without mm because it might have passed its exit_mm and 823 * exit_oom_victim. oom_reaper could have rescued that but do not rely 824 * on that for now. We can consider find_lock_task_mm in future. 825 */ 826 if (!mm) 827 return false; 828 829 if (!__task_will_free_mem(task)) 830 return false; 831 832 /* 833 * This task has already been drained by the oom reaper so there are 834 * only small chances it will free some more 835 */ 836 if (test_bit(MMF_OOM_SKIP, &mm->flags)) 837 return false; 838 839 if (atomic_read(&mm->mm_users) <= 1) 840 return true; 841 842 /* 843 * Make sure that all tasks which share the mm with the given tasks 844 * are dying as well to make sure that a) nobody pins its mm and 845 * b) the task is also reapable by the oom reaper. 846 */ 847 rcu_read_lock(); 848 for_each_process(p) { 849 if (!process_shares_mm(p, mm)) 850 continue; 851 if (same_thread_group(task, p)) 852 continue; 853 ret = __task_will_free_mem(p); 854 if (!ret) 855 break; 856 } 857 rcu_read_unlock(); 858 859 return ret; 860 } 861 862 static void __oom_kill_process(struct task_struct *victim, const char *message) 863 { 864 struct task_struct *p; 865 struct mm_struct *mm; 866 bool can_oom_reap = true; 867 868 p = find_lock_task_mm(victim); 869 if (!p) { 870 pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n", 871 message, task_pid_nr(victim), victim->comm); 872 put_task_struct(victim); 873 return; 874 } else if (victim != p) { 875 get_task_struct(p); 876 put_task_struct(victim); 877 victim = p; 878 } 879 880 /* Get a reference to safely compare mm after task_unlock(victim) */ 881 mm = victim->mm; 882 mmgrab(mm); 883 884 /* Raise event before sending signal: task reaper must see this */ 885 count_vm_event(OOM_KILL); 886 memcg_memory_event_mm(mm, MEMCG_OOM_KILL); 887 888 /* 889 * We should send SIGKILL before granting access to memory reserves 890 * in order to prevent the OOM victim from depleting the memory 891 * reserves from the user space under its control. 892 */ 893 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 894 mark_oom_victim(victim); 895 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", 896 message, task_pid_nr(victim), victim->comm, K(mm->total_vm), 897 K(get_mm_counter(mm, MM_ANONPAGES)), 898 K(get_mm_counter(mm, MM_FILEPAGES)), 899 K(get_mm_counter(mm, MM_SHMEMPAGES)), 900 from_kuid(&init_user_ns, task_uid(victim)), 901 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); 902 task_unlock(victim); 903 904 /* 905 * Kill all user processes sharing victim->mm in other thread groups, if 906 * any. They don't get access to memory reserves, though, to avoid 907 * depletion of all memory. This prevents mm->mmap_lock livelock when an 908 * oom killed thread cannot exit because it requires the semaphore and 909 * its contended by another thread trying to allocate memory itself. 910 * That thread will now get access to memory reserves since it has a 911 * pending fatal signal. 912 */ 913 rcu_read_lock(); 914 for_each_process(p) { 915 if (!process_shares_mm(p, mm)) 916 continue; 917 if (same_thread_group(p, victim)) 918 continue; 919 if (is_global_init(p)) { 920 can_oom_reap = false; 921 set_bit(MMF_OOM_SKIP, &mm->flags); 922 pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", 923 task_pid_nr(victim), victim->comm, 924 task_pid_nr(p), p->comm); 925 continue; 926 } 927 /* 928 * No kthread_use_mm() user needs to read from the userspace so 929 * we are ok to reap it. 930 */ 931 if (unlikely(p->flags & PF_KTHREAD)) 932 continue; 933 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); 934 } 935 rcu_read_unlock(); 936 937 if (can_oom_reap) 938 wake_oom_reaper(victim); 939 940 mmdrop(mm); 941 put_task_struct(victim); 942 } 943 #undef K 944 945 /* 946 * Kill provided task unless it's secured by setting 947 * oom_score_adj to OOM_SCORE_ADJ_MIN. 948 */ 949 static int oom_kill_memcg_member(struct task_struct *task, void *message) 950 { 951 if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && 952 !is_global_init(task)) { 953 get_task_struct(task); 954 __oom_kill_process(task, message); 955 } 956 return 0; 957 } 958 959 static void oom_kill_process(struct oom_control *oc, const char *message) 960 { 961 struct task_struct *victim = oc->chosen; 962 struct mem_cgroup *oom_group; 963 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 964 DEFAULT_RATELIMIT_BURST); 965 966 /* 967 * If the task is already exiting, don't alarm the sysadmin or kill 968 * its children or threads, just give it access to memory reserves 969 * so it can die quickly 970 */ 971 task_lock(victim); 972 if (task_will_free_mem(victim)) { 973 mark_oom_victim(victim); 974 wake_oom_reaper(victim); 975 task_unlock(victim); 976 put_task_struct(victim); 977 return; 978 } 979 task_unlock(victim); 980 981 if (__ratelimit(&oom_rs)) 982 dump_header(oc, victim); 983 984 /* 985 * Do we need to kill the entire memory cgroup? 986 * Or even one of the ancestor memory cgroups? 987 * Check this out before killing the victim task. 988 */ 989 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); 990 991 __oom_kill_process(victim, message); 992 993 /* 994 * If necessary, kill all tasks in the selected memory cgroup. 995 */ 996 if (oom_group) { 997 mem_cgroup_print_oom_group(oom_group); 998 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, 999 (void *)message); 1000 mem_cgroup_put(oom_group); 1001 } 1002 } 1003 1004 /* 1005 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 1006 */ 1007 static void check_panic_on_oom(struct oom_control *oc) 1008 { 1009 if (likely(!sysctl_panic_on_oom)) 1010 return; 1011 if (sysctl_panic_on_oom != 2) { 1012 /* 1013 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel 1014 * does not panic for cpuset, mempolicy, or memcg allocation 1015 * failures. 1016 */ 1017 if (oc->constraint != CONSTRAINT_NONE) 1018 return; 1019 } 1020 /* Do not panic for oom kills triggered by sysrq */ 1021 if (is_sysrq_oom(oc)) 1022 return; 1023 dump_header(oc, NULL); 1024 panic("Out of memory: %s panic_on_oom is enabled\n", 1025 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 1026 } 1027 1028 static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 1029 1030 int register_oom_notifier(struct notifier_block *nb) 1031 { 1032 return blocking_notifier_chain_register(&oom_notify_list, nb); 1033 } 1034 EXPORT_SYMBOL_GPL(register_oom_notifier); 1035 1036 int unregister_oom_notifier(struct notifier_block *nb) 1037 { 1038 return blocking_notifier_chain_unregister(&oom_notify_list, nb); 1039 } 1040 EXPORT_SYMBOL_GPL(unregister_oom_notifier); 1041 1042 /** 1043 * out_of_memory - kill the "best" process when we run out of memory 1044 * @oc: pointer to struct oom_control 1045 * 1046 * If we run out of memory, we have the choice between either 1047 * killing a random task (bad), letting the system crash (worse) 1048 * OR try to be smart about which process to kill. Note that we 1049 * don't have to be perfect here, we just have to be good. 1050 */ 1051 bool out_of_memory(struct oom_control *oc) 1052 { 1053 unsigned long freed = 0; 1054 1055 if (oom_killer_disabled) 1056 return false; 1057 1058 if (!is_memcg_oom(oc)) { 1059 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 1060 if (freed > 0) 1061 /* Got some memory back in the last second. */ 1062 return true; 1063 } 1064 1065 /* 1066 * If current has a pending SIGKILL or is exiting, then automatically 1067 * select it. The goal is to allow it to allocate so that it may 1068 * quickly exit and free its memory. 1069 */ 1070 if (task_will_free_mem(current)) { 1071 mark_oom_victim(current); 1072 wake_oom_reaper(current); 1073 return true; 1074 } 1075 1076 /* 1077 * The OOM killer does not compensate for IO-less reclaim. 1078 * pagefault_out_of_memory lost its gfp context so we have to 1079 * make sure exclude 0 mask - all other users should have at least 1080 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to 1081 * invoke the OOM killer even if it is a GFP_NOFS allocation. 1082 */ 1083 if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) 1084 return true; 1085 1086 /* 1087 * Check if there were limitations on the allocation (only relevant for 1088 * NUMA and memcg) that may require different handling. 1089 */ 1090 oc->constraint = constrained_alloc(oc); 1091 if (oc->constraint != CONSTRAINT_MEMORY_POLICY) 1092 oc->nodemask = NULL; 1093 check_panic_on_oom(oc); 1094 1095 if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && 1096 current->mm && !oom_unkillable_task(current) && 1097 oom_cpuset_eligible(current, oc) && 1098 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 1099 get_task_struct(current); 1100 oc->chosen = current; 1101 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); 1102 return true; 1103 } 1104 1105 select_bad_process(oc); 1106 /* Found nothing?!?! */ 1107 if (!oc->chosen) { 1108 dump_header(oc, NULL); 1109 pr_warn("Out of memory and no killable processes...\n"); 1110 /* 1111 * If we got here due to an actual allocation at the 1112 * system level, we cannot survive this and will enter 1113 * an endless loop in the allocator. Bail out now. 1114 */ 1115 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) 1116 panic("System is deadlocked on memory\n"); 1117 } 1118 if (oc->chosen && oc->chosen != (void *)-1UL) 1119 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : 1120 "Memory cgroup out of memory"); 1121 return !!oc->chosen; 1122 } 1123 1124 /* 1125 * The pagefault handler calls here because some allocation has failed. We have 1126 * to take care of the memcg OOM here because this is the only safe context without 1127 * any locks held but let the oom killer triggered from the allocation context care 1128 * about the global OOM. 1129 */ 1130 void pagefault_out_of_memory(void) 1131 { 1132 static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, 1133 DEFAULT_RATELIMIT_BURST); 1134 1135 if (mem_cgroup_oom_synchronize(true)) 1136 return; 1137 1138 if (fatal_signal_pending(current)) 1139 return; 1140 1141 if (__ratelimit(&pfoom_rs)) 1142 pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); 1143 } 1144 1145 SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) 1146 { 1147 #ifdef CONFIG_MMU 1148 struct mm_struct *mm = NULL; 1149 struct task_struct *task; 1150 struct task_struct *p; 1151 unsigned int f_flags; 1152 bool reap = false; 1153 long ret = 0; 1154 1155 if (flags) 1156 return -EINVAL; 1157 1158 task = pidfd_get_task(pidfd, &f_flags); 1159 if (IS_ERR(task)) 1160 return PTR_ERR(task); 1161 1162 /* 1163 * Make sure to choose a thread which still has a reference to mm 1164 * during the group exit 1165 */ 1166 p = find_lock_task_mm(task); 1167 if (!p) { 1168 ret = -ESRCH; 1169 goto put_task; 1170 } 1171 1172 if (mmget_not_zero(p->mm)) { 1173 mm = p->mm; 1174 if (task_will_free_mem(p)) 1175 reap = true; 1176 else { 1177 /* Error only if the work has not been done already */ 1178 if (!test_bit(MMF_OOM_SKIP, &mm->flags)) 1179 ret = -EINVAL; 1180 } 1181 } 1182 task_unlock(p); 1183 1184 if (!reap) 1185 goto drop_mm; 1186 1187 if (mmap_read_lock_killable(mm)) { 1188 ret = -EINTR; 1189 goto drop_mm; 1190 } 1191 if (!__oom_reap_task_mm(mm)) 1192 ret = -EAGAIN; 1193 mmap_read_unlock(mm); 1194 1195 drop_mm: 1196 if (mm) 1197 mmput(mm); 1198 put_task: 1199 put_task_struct(task); 1200 return ret; 1201 #else 1202 return -ENOSYS; 1203 #endif /* CONFIG_MMU */ 1204 } 1205