/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C) 1998,2000 Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */

/*
 * Do the mems_allowed of any thread in the target process intersect ours?
 */
static int has_intersects_mems_allowed(struct task_struct *tsk)
{
	struct task_struct *t;

	t = tsk;
	do {
		if (cpuset_mems_allowed_intersects(current, t))
			return 1;
		t = next_thread(t);
	} while (t != tsk);

	return 0;
}

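/*
 * Example: if current's cpuset allows nodes 0-1 while every thread of the
 * target is restricted to node 2, the check above returns 0 and badness()
 * below divides the task's score by 8, making it a less attractive victim.
 */
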
/**
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of the task we should calculate the badness of
 * @uptime: current uptime in seconds
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
 * to kill when we run out of memory.
 *
 * Good in this context means that:
 * 1) we lose the minimum amount of work done
 * 2) we recover a large amount of memory
 * 3) we don't kill anything innocent of eating tons of memory
 * 4) we want to kill the minimum amount of processes (one)
 * 5) we try to kill the process the user expects us to kill, this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)
 */

unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time;
	struct mm_struct *mm;
	struct task_struct *child;
	int oom_adj = p->signal->oom_adj;
	struct task_cputime task_time;
	unsigned long utime;
	unsigned long stime;

	if (oom_adj == OOM_DISABLE)
		return 0;

	task_lock(p);
	mm = p->mm;
	if (!mm) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = mm->total_vm;

	/*
	 * After this unlock we can no longer dereference local variable `mm'
	 */
	task_unlock(p);

	/*
	 * swapoff can easily use up all memory, so kill those first.
	 */
	if (p->flags & PF_OOM_ORIGIN)
		return ULONG_MAX;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add half the vmsize of the children if they
	 * have their own mm. This prevents forking servers from flooding
	 * the machine with an endless amount of children. In case a single
	 * child is eating the vast majority of memory, adding only half
	 * to the parent's score will make the child our kill candidate
	 * of choice.
	 */
	list_for_each_entry(child, &p->children, sibling) {
		task_lock(child);
		if (child->mm != mm && child->mm)
			points += child->mm->total_vm/2 + 1;
		task_unlock(child);
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	thread_group_cputime(p, &task_time);
	utime = cputime_to_jiffies(task_time.utime);
	stime = cputime_to_jiffies(task_time.stime);
	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);

	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;

	if (cpu_time)
		points /= int_sqrt(cpu_time);
	if (run_time)
		points /= int_sqrt(int_sqrt(run_time));

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * If p's nodes don't overlap ours, it may still help to kill p
	 * because p may have allocated or otherwise mapped memory on
	 * this node before. However it will be less likely.
	 */
	if (!has_intersects_mems_allowed(p))
		points /= 8;

	/*
	 * Adjust the score by oom_adj.
	 */
	if (oom_adj) {
		if (oom_adj > 0) {
			if (!points)
				points = 1;
			points <<= oom_adj;
		} else
			points >>= -(oom_adj);
	}

#ifdef DEBUG
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
		p->pid, p->comm, points);
#endif
	return points;
}

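/*
 * Worked example (illustrative numbers): a task with total_vm of 100000
 * pages and one child owning 20000 pages starts at
 * 100000 + 20000/2 + 1 = 110001 points. If its scaled cpu_time is 100 and
 * its scaled run_time is 16, the score becomes
 * 110001 / int_sqrt(100) = 11000, then 11000 / int_sqrt(int_sqrt(16)) = 5500.
 * With oom_adj = +2 that is shifted up to 22000; with oom_adj = -2 it is
 * shifted down to 1375.
 */
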
/*
 * Determine the type of allocation constraint.
 */
static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
						    gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	nodemask_t nodes = node_states[N_HIGH_MEMORY];

	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
		if (cpuset_zone_allowed_softwall(zone, gfp_mask))
			node_clear(zone_to_nid(zone), nodes);
		else
			return CONSTRAINT_CPUSET;

	if (!nodes_empty(nodes))
		return CONSTRAINT_MEMORY_POLICY;
#endif

	return CONSTRAINT_NONE;
}

/*
 * Simple selection loop. We choose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(unsigned long *ppoints,
						struct mem_cgroup *mem)
{
	struct task_struct *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;
	*ppoints = 0;

	do_posix_clock_monotonic_gettime(&uptime);
	for_each_process(p) {
		unsigned long points;

		/*
		 * skip kernel threads and tasks which have already released
		 * their mm.
		 */
		if (!p->mm)
			continue;
		/* skip the init task */
		if (is_global_init(p))
			continue;
		if (mem && !task_in_mem_cgroup(p, mem))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		/*
		 * This is in the process of releasing memory so wait for it
		 * to finish before killing some other task by mistake.
		 *
		 * However, if p is the current task, we allow the 'kill' to
		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
		 * which will allow it to gain access to memory reserves in
		 * the process of exiting and releasing its resources.
		 * Otherwise we could get an easy OOM deadlock.
		 */
		if (p->flags & PF_EXITING) {
			if (p != current)
				return ERR_PTR(-1UL);

			chosen = p;
			*ppoints = ULONG_MAX;
		}

		if (p->signal->oom_adj == OOM_DISABLE)
			continue;

		points = badness(p, uptime.tv_sec);
		if (points > *ppoints || !chosen) {
			chosen = p;
			*ppoints = points;
		}
	}

	return chosen;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: target memory controller
 *
 * Dumps the current memory state of all system tasks, excluding kernel
 * threads. State information includes task's pid, uid, tgid, vm size, rss,
 * cpu, oom_adj score, and name.
 *
 * If @mem is non-NULL, only tasks that are a member of the mem_cgroup are
 * shown.
 *
 * Call with tasklist_lock read-locked.
 */
static void dump_tasks(const struct mem_cgroup *mem)
{
	struct task_struct *g, *p;

	printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
	       "name\n");
	do_each_thread(g, p) {
		struct mm_struct *mm;

		if (mem && !task_in_mem_cgroup(p, mem))
			continue;
		if (!thread_group_leader(p))
			continue;

		task_lock(p);
		mm = p->mm;
		if (!mm) {
			/*
			 * total_vm and rss sizes do not exist for tasks with no
			 * mm so there's no need to report them; they can't be
			 * oom killed anyway.
			 */
			task_unlock(p);
			continue;
		}
		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
		       get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
		       p->comm);
		task_unlock(p);
	} while_each_thread(g, p);
}

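/*
 * The resulting report looks roughly like this (values illustrative):
 *
 *   [ pid ]   uid  tgid total_vm      rss cpu oom_adj name
 *   [ 1234]  1000  1234    51200     8042   2       0 firefox
 *
 * Note that total_vm and rss are counted in pages, not kilobytes.
 */
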
/*
 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAWIO,
 * though it's unlikely that we select a process with CAP_SYS_RAWIO set.
 */
static void __oom_kill_task(struct task_struct *p, int verbose)
{
	if (is_global_init(p)) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill init!\n");
		return;
	}

	if (!p->mm) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill an mm-less task!\n");
		return;
	}

	if (verbose)
		printk(KERN_ERR "Killed process %d (%s)\n",
				task_pid_nr(p), p->comm);

	/*
	 * We give our sacrificial lamb high priority and access to
	 * all the memory it needs. That way it should be able to
	 * exit() and clear out its resources quickly...
	 */
	p->rt.time_slice = HZ;
	set_tsk_thread_flag(p, TIF_MEMDIE);

	force_sig(SIGKILL, p);
}

static int oom_kill_task(struct task_struct *p)
{
	/* WARNING: mm may not be dereferenced since we did not obtain its
	 * value from get_task_mm(p). This is OK since all we need to do is
	 * compare mm to q->mm below.
	 *
	 * Furthermore, even if mm contains a non-NULL value, p->mm may
	 * change to NULL at any time since we do not hold task_lock(p).
	 * However, this is of no concern to us.
	 */
	if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
		return 1;

	__oom_kill_task(p, 1);

	return 0;
}

static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			    unsigned long points, struct mem_cgroup *mem,
			    const char *message)
{
	struct task_struct *c;

	if (printk_ratelimit()) {
		printk(KERN_WARNING "%s invoked oom-killer: "
			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
			current->comm, gfp_mask, order,
			current->signal->oom_adj);
		task_lock(current);
		cpuset_print_task_mems_allowed(current);
		task_unlock(current);
		dump_stack();
		mem_cgroup_print_oom_info(mem, current);
		show_mem();
		if (sysctl_oom_dump_tasks)
			dump_tasks(mem);
	}

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		__oom_kill_task(p, 0);
		return 0;
	}

	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
					message, task_pid_nr(p), p->comm, points);

	/* Try to kill a child first */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == p->mm)
			continue;
		if (!oom_kill_task(c))
			return 0;
	}
	return oom_kill_task(p);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	unsigned long points = 0;
	struct task_struct *p;

	read_lock(&tasklist_lock);
retry:
	p = select_bad_process(&points, mem);
	if (PTR_ERR(p) == -1UL)
		goto out;

	if (!p)
		p = current;

	if (oom_kill_process(p, gfp_mask, 0, points, mem,
				"Memory cgroup out of memory"))
		goto retry;
out:
	read_unlock(&tasklist_lock);
}
#endif

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

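/*
 * The zone OOM locking below serializes OOM killing per zonelist. The page
 * allocator (see __alloc_pages() in mm/page_alloc.c) is expected to use it
 * along the lines of:
 *
 *	if (try_set_zone_oom(zonelist, gfp_mask)) {
 *		out_of_memory(zonelist, gfp_mask, order);
 *		clear_zonelist_oom(zonelist, gfp_mask);
 *	}
 *
 * so that only one OOM kill is in flight for any given set of zones.
 */
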
/*
 * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
 * if a parallel OOM killing is already taking place that includes a zone in
 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
 */
int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;
	int ret = 1;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		if (zone_is_oom_locked(zone)) {
			ret = 0;
			goto out;
		}
	}

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		/*
		 * Lock each zone in the zonelist under zone_scan_lock so a
		 * parallel invocation of try_set_zone_oom() doesn't succeed
		 * when it shouldn't.
		 */
		zone_set_flag(zone, ZONE_OOM_LOCKED);
	}

out:
	spin_unlock(&zone_scan_lock);
	return ret;
}

/*
 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
 * allocation attempts with zonelists containing them may now recall the OOM
 * killer, if necessary.
 */
void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	struct zoneref *z;
	struct zone *zone;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		zone_clear_flag(zone, ZONE_OOM_LOCKED);
	}
	spin_unlock(&zone_scan_lock);
}

/*
 * Must be called with tasklist_lock held for read.
 */
static void __out_of_memory(gfp_t gfp_mask, int order)
{
	struct task_struct *p;
	unsigned long points;

	if (sysctl_oom_kill_allocating_task)
		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
				"Out of memory (oom_kill_allocating_task)"))
			return;
retry:
	/*
	 * Rambo mode: Shoot down a process and hope it solves whatever
	 * issues we may have.
	 */
	p = select_bad_process(&points, NULL);

	if (PTR_ERR(p) == -1UL)
		return;

	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		read_unlock(&tasklist_lock);
		panic("Out of memory and no killable processes...\n");
	}

	if (oom_kill_process(p, gfp_mask, order, points, NULL,
			     "Out of memory"))
		goto retry;
}

/*
 * The pagefault handler calls into here because it is out of memory but
 * doesn't know exactly how or why.
 */
void pagefault_out_of_memory(void)
{
	unsigned long freed = 0;

	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	/*
	 * If this came from a memcg, the OOM killer has already been
	 * invoked there, so it is not worth triggering a system-wide OOM.
	 */
	if (mem_cgroup_oom_called(current))
		goto rest_and_return;

	if (sysctl_panic_on_oom)
		panic("out of memory from page fault. panic_on_oom is selected.\n");

	read_lock(&tasklist_lock);
	__out_of_memory(0, 0); /* unknown gfp_mask and order */
	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory.
	 */
rest_and_return:
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

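/*
 * Policy knobs that steer the decisions in this file: sysctl_panic_on_oom,
 * sysctl_oom_kill_allocating_task and sysctl_oom_dump_tasks (declared at the
 * top of this file and typically exposed under /proc/sys/vm/), plus the
 * per-task oom_adj value in p->signal->oom_adj, which userspace usually
 * tweaks through /proc/<pid>/oom_adj.
 */
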
/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @zonelist: zonelist pointer
 * @gfp_mask: memory allocation flags
 * @order: amount of memory being requested as a power of 2
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * or trying to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
	unsigned long freed = 0;
	enum oom_constraint constraint;

	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	if (sysctl_panic_on_oom == 2)
		panic("out of memory. Compulsory panic_on_oom is selected.\n");

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
	constraint = constrained_alloc(zonelist, gfp_mask);
	read_lock(&tasklist_lock);

	switch (constraint) {
	case CONSTRAINT_MEMORY_POLICY:
		oom_kill_process(current, gfp_mask, order, 0, NULL,
				"No available memory (MPOL_BIND)");
		break;

	case CONSTRAINT_NONE:
		if (sysctl_panic_on_oom)
			panic("out of memory. panic_on_oom is selected\n");
		/* Fall-through */
	case CONSTRAINT_CPUSET:
		__out_of_memory(gfp_mask, order);
		break;
	}

	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

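/*
 * Summary of the panic_on_oom behaviour implemented above: a value of 2
 * panics on every OOM, regardless of how the allocation was constrained;
 * a value of 1 panics only for unconstrained (CONSTRAINT_NONE) OOMs, while
 * cpuset- and mempolicy-constrained OOMs still just kill a task; and in
 * pagefault_out_of_memory() any non-zero value causes a panic, since the
 * constraint is unknown there.
 */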