/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

	int swappiness;
};

/*
 * The list of shrinker callbacks used to apply pressure to
 * ageable caches.
 */
struct shrinker {
	shrinker_t		shrinker;
	struct list_head	list;
	int			seeks;	/* seeks to recreate an obj */
	long			nr;	/* objs pending delete */
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

/*
 * Add a shrinker callback to be called from the vm
 */
struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
{
	struct shrinker *shrinker;

	shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
	if (shrinker) {
		shrinker->shrinker = theshrinker;
		shrinker->seeks = seeks;
		shrinker->nr = 0;
		down_write(&shrinker_rwsem);
		list_add_tail(&shrinker->list, &shrinker_list);
		up_write(&shrinker_rwsem);
	}
	return shrinker;
}
EXPORT_SYMBOL(set_shrinker);

/*
 * Remove one
 */
void remove_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker);
}
EXPORT_SYMBOL(remove_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);

		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			printk(KERN_ERR "%s: nr=%ld\n",
					__FUNCTION__, shrinker->nr);
			shrinker->nr = max_pass;
		}
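		/*
		 * Illustrative example (assumed numbers, not measured):
		 * with scanned = 1024 LRU pages, seeks = 2, a cache
		 * reporting max_pass = 4096 freeable objects and
		 * lru_pages = 65536, delta = (4 * 1024 / 2) * 4096 / 65537,
		 * which is roughly 128, so about 128 objects' worth of
		 * pressure is queued on shrinker->nr for this pass.
		 */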
		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrinker)(0, gfp_mask);
			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/* Page is in somebody's page tables. */
	if (page_mapped(page))
		return 1;

	/* Be more reluctant to reclaim swapcache than pagecache */
	if (PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	/* File is mmap'd by somebody? */
	return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
	return page_count(page) - !!PagePrivate(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping) {
		if (error == -ENOSPC)
			set_bit(AS_ENOSPC, &mapping->flags);
		else
			set_bit(AS_EIO, &mapping->flags);
	}
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.
	 * This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __FUNCTION__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}
		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}

		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return 0;		/* truncate got there first */

	write_lock_irq(&mapping->tree_lock);

	/*
	 * The non-racy check for busy page.  It is critical to check
	 * PageDirty _after_ making sure that the page is freeable and
	 * not in use by anybody.  (pagecache + us == 2)
	 */
	if (unlikely(page_count(page) != 2))
		goto cannot_free;
	smp_rmb();
	if (unlikely(PageDirty(page)))
		goto cannot_free;

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		write_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
		__put_page(page);	/* The pagecache ref */
		return 1;
	}

	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
	__put_page(page);
	return 1;

cannot_free:
	write_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (TestSetPageLocked(page))
			goto keep;

		BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (!sc->may_swap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		if (PageWriteback(page))
			goto keep_locked;

		referenced = page_referenced(page, 1);
		/* In active use or really unfreeable?  Activate it. */
		if (referenced && page_mapping_inuse(page))
			goto activate_locked;

#ifdef CONFIG_SWAP
		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page))
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
#endif /* CONFIG_SWAP */

		mapping = page_mapping(page);
		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch(pageout(page, mapping)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page) || PageDirty(page))
					goto keep;
				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (TestSetPageLocked(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1)
				goto free_it;
		}

		if (!remove_mapping(mapping, page))
			goto keep_locked;

free_it:
		unlock_page(page);
		nr_reclaimed++;
		if (!pagevec_add(&freed_pvec, page))
			__pagevec_release_nonlru(&freed_pvec);
		continue;

activate_locked:
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		BUG_ON(PageLRU(page));
	}
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_release_nonlru(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}
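/*
 * Note on the batching above: pages freed by shrink_page_list() were already
 * taken off the LRU by the caller, so they are accumulated in a pagevec and
 * dropped with __pagevec_release_nonlru(), which skips the LRU-handling
 * release path that would otherwise be taken for each individual page.
 */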
/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct list_head *target;
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		BUG_ON(!PageLRU(page));

		list_del(&page->lru);
		target = src;
		if (likely(get_page_unless_zero(page))) {
			/*
			 * Be careful not to clear PageLRU until after we're
			 * sure the page is not being freed elsewhere -- the
			 * page release code relies on it.
			 */
			ClearPageLRU(page);
			target = dst;
			nr_taken++;
		} /* else it is being freed elsewhere */

		list_add(&page->lru, target);
	}

	*scanned = scan;
	return nr_taken;
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
				struct zone *zone, struct scan_control *sc)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	unsigned long nr_scanned = 0;
	unsigned long nr_reclaimed = 0;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
		struct page *page;
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;

		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
					     &zone->inactive_list,
					     &page_list, &nr_scan);
		zone->nr_inactive -= nr_taken;
		zone->pages_scanned += nr_scan;
		spin_unlock_irq(&zone->lru_lock);

		nr_scanned += nr_scan;
		nr_freed = shrink_page_list(&page_list, sc);
		nr_reclaimed += nr_freed;
		local_irq_disable();
		if (current_is_kswapd()) {
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

		spin_lock(&zone->lru_lock);
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
			page = lru_to_page(&page_list);
			BUG_ON(PageLRU(page));
			SetPageLRU(page);
			list_del(&page->lru);
			if (PageActive(page))
				add_page_to_active_list(zone, page);
			else
				add_page_to_inactive_list(zone, page);
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
	} while (nr_scanned < max_scan);
	spin_unlock(&zone->lru_lock);
done:
	local_irq_enable();
	pagevec_release(&pvec);
	return nr_reclaimed;
}
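/*
 * Note: with the default swap_cluster_max of SWAP_CLUSTER_MAX (normally 32),
 * the loop above takes zone->lru_lock once per batch of up to 32 pages and
 * drops it while shrink_page_list() does the real work, rather than holding
 * the lock across the whole max_scan window.
 */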
/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long pgmoved;
	int pgdeactivate = 0;
	unsigned long pgscanned;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
	struct page *page;
	struct pagevec pvec;
	int reclaim_mapped = 0;

	if (sc->may_swap) {
		long mapped_ratio;
		long distress;
		long swap_tendency;

		/*
		 * `distress' is a measure of how much trouble we're having
		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
		 */
		distress = 100 >> zone->prev_priority;

		/*
		 * The point of this algorithm is to decide when to start
		 * reclaiming mapped memory instead of just pagecache.  Work
		 * out how much memory is mapped.
		 */
		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
				global_page_state(NR_ANON_PAGES)) * 100) /
					vm_total_pages;

		/*
		 * Now decide how much we really want to unmap some pages.  The
		 * mapped ratio is downgraded - just because there's a lot of
		 * mapped memory doesn't necessarily mean that page reclaim
		 * isn't succeeding.
		 *
		 * The distress ratio is important - we don't want to start
		 * going oom.
		 *
		 * A 100% value of vm_swappiness overrides this algorithm
		 * altogether.
		 */
		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;

		/*
		 * Now use this metric to decide whether to start moving mapped
		 * memory onto the inactive list.
		 */
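		/*
		 * Illustrative example (assumed numbers): with
		 * prev_priority = 6, distress = 100 >> 6 = 1; if half of
		 * memory is mapped, mapped_ratio = 50; with the default
		 * swappiness of 60 that gives swap_tendency =
		 * 50/2 + 1 + 60 = 86, which is below 100, so mapped pages
		 * are left alone on this pass.  Higher distress or a higher
		 * swappiness setting tips the result over 100.
		 */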
		if (swap_tendency >= 100)
			reclaim_mapped = 1;
	}

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
				    &l_hold, &pgscanned);
	zone->pages_scanned += pgscanned;
	zone->nr_active -= pgmoved;
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);
		if (page_mapped(page)) {
			if (!reclaim_mapped ||
			    (total_swap_pages == 0 && PageAnon(page)) ||
			    page_referenced(page, 0)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}
		list_add(&page->lru, &l_inactive);
	}

	pagevec_init(&pvec, 1);
	pgmoved = 0;
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(&l_inactive)) {
		page = lru_to_page(&l_inactive);
		prefetchw_prev_lru_page(page, &l_inactive, flags);
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		BUG_ON(!PageActive(page));
		ClearPageActive(page);

		list_move(&page->lru, &zone->inactive_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_inactive += pgmoved;
			spin_unlock_irq(&zone->lru_lock);
			pgdeactivate += pgmoved;
			pgmoved = 0;
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_inactive += pgmoved;
	pgdeactivate += pgmoved;
	if (buffer_heads_over_limit) {
		spin_unlock_irq(&zone->lru_lock);
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}

	pgmoved = 0;
	while (!list_empty(&l_active)) {
		page = lru_to_page(&l_active);
		prefetchw_prev_lru_page(page, &l_active, flags);
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		BUG_ON(!PageActive(page));
		list_move(&page->lru, &zone->active_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_active += pgmoved;
			pgmoved = 0;
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_active += pgmoved;

	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	__count_vm_events(PGDEACTIVATE, pgdeactivate);
	spin_unlock_irq(&zone->lru_lock);

	pagevec_release(&pvec);
}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static unsigned long shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr_active;
	unsigned long nr_inactive;
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = 0;

	atomic_inc(&zone->reclaim_in_progress);

	/*
	 * Add one to `nr_to_scan' just to make sure that the kernel will
	 * slowly sift through the active list.
	 */
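	/*
	 * Example of the accumulation (assumed zone size): at DEF_PRIORITY
	 * (12), a zone with 100000 active pages adds (100000 >> 12) + 1 = 25
	 * to nr_scan_active per call; only once the accumulated count reaches
	 * sc->swap_cluster_max is a batch actually scanned below, otherwise
	 * the work is deferred to a later call.
	 */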
	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
	nr_active = zone->nr_scan_active;
	if (nr_active >= sc->swap_cluster_max)
		zone->nr_scan_active = 0;
	else
		nr_active = 0;

	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
	nr_inactive = zone->nr_scan_inactive;
	if (nr_inactive >= sc->swap_cluster_max)
		zone->nr_scan_inactive = 0;
	else
		nr_inactive = 0;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			nr_to_scan = min(nr_active,
					(unsigned long)sc->swap_cluster_max);
			nr_active -= nr_to_scan;
			shrink_active_list(nr_to_scan, zone, sc);
		}

		if (nr_inactive) {
			nr_to_scan = min(nr_inactive,
					(unsigned long)sc->swap_cluster_max);
			nr_inactive -= nr_to_scan;
			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
								sc);
		}
	}

	throttle_vm_writeout();

	atomic_dec(&zone->reclaim_in_progress);
	return nr_reclaimed;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * Returns the number of reclaimed pages.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static unsigned long shrink_zones(int priority, struct zone **zones,
					struct scan_control *sc)
{
	unsigned long nr_reclaimed = 0;
	int i;

	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!populated_zone(zone))
			continue;

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->temp_priority = priority;
		if (zone->prev_priority > priority)
			zone->prev_priority = priority;

		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
			continue;	/* Let kswapd poll it */

		nr_reclaimed += shrink_zone(priority, zone, sc);
	}
	return nr_reclaimed;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 */
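/*
 * Note on the priority loop below: priority starts at DEF_PRIORITY
 * (nominally 12) and each failed pass drops it by one, roughly doubling the
 * per-zone scan window (lru_size >> priority) until, at priority 0, whole
 * LRU lists are eligible for scanning in a single pass.
 */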
unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
	int priority;
	int ret = 0;
	unsigned long total_scanned = 0;
	unsigned long nr_reclaimed = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long lru_pages = 0;
	int i;
	struct scan_control sc = {
		.gfp_mask = gfp_mask,
		.may_writepage = !laptop_mode,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.may_swap = 1,
		.swappiness = vm_swappiness,
	};

	count_vm_event(ALLOCSTALL);

	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->temp_priority = DEF_PRIORITY;
		lru_pages += zone->nr_active + zone->nr_inactive;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		sc.nr_scanned = 0;
		if (!priority)
			disable_swap_token();
		nr_reclaimed += shrink_zones(priority, zones, &sc);
		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
		if (reclaim_state) {
			nr_reclaimed += reclaim_state->reclaimed_slab;
			reclaim_state->reclaimed_slab = 0;
		}
		total_scanned += sc.nr_scanned;
		if (nr_reclaimed >= sc.swap_cluster_max) {
			ret = 1;
			goto out;
		}

		/*
		 * Try to write back as many pages as we just scanned.  This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice.  But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		if (total_scanned > sc.swap_cluster_max +
					sc.swap_cluster_max / 2) {
			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
			sc.may_writepage = 1;
		}

		/* Take a nap, wait for some writeback to complete */
		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
			blk_congestion_wait(WRITE, HZ/10);
	}
out:
	for (i = 0; zones[i] != 0; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
			continue;

		zone->prev_priority = zone->temp_priority;
	}
	return ret;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
 *
 * Returns the number of pages which were actually freed.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > pages_high, but once a zone is found to have
 * free_pages <= pages_high, we scan that zone and the lower zones regardless
 * of the number of free pages in the lower zones.  This interoperates with
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
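/*
 * Illustration (a typical 32-bit layout, assumed here only as an example):
 * node_zones[] is ordered ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, so the
 * "highmem->dma direction" above means walking that array from the highest
 * populated index down towards index 0, and "lower zones" means lower
 * array indices.
 */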
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
	int all_zones_ok;
	int priority;
	int i;
	unsigned long total_scanned;
	unsigned long nr_reclaimed;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness = vm_swappiness,
	};

loop_again:
	total_scanned = 0;
	nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->temp_priority = DEF_PRIORITY;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
		unsigned long lru_pages = 0;

		/* The swap token gets in the way of swapout... */
		if (!priority)
			disable_swap_token();

		all_zones_ok = 1;

		/*
		 * Scan in the highmem->dma direction for the highest
		 * zone which needs scanning
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       0, 0)) {
				end_zone = i;
				goto scan;
			}
		}
		goto out;
scan:
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone->nr_active + zone->nr_inactive;
		}

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       end_zone, 0))
				all_zones_ok = 0;
			zone->temp_priority = priority;
			if (zone->prev_priority > priority)
				zone->prev_priority = priority;
			sc.nr_scanned = 0;
			nr_reclaimed += shrink_zone(priority, zone, &sc);
			reclaim_state->reclaimed_slab = 0;
			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
						lru_pages);
			nr_reclaimed += reclaim_state->reclaimed_slab;
			total_scanned += sc.nr_scanned;
			if (zone->all_unreclaimable)
				continue;
			if (nr_slab == 0 && zone->pages_scanned >=
				    (zone->nr_active + zone->nr_inactive) * 4)
				zone->all_unreclaimable = 1;
			/*
			 * If we've done a decent amount of scanning and
			 * the reclaim ratio is low, start doing writepage
			 * even in laptop mode
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
		 * another pass across the zones.
		 */
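		/*
		 * blk_congestion_wait(WRITE, HZ/10) below sleeps for up to
		 * roughly 100ms, returning earlier if a congested write
		 * queue signals that it has cleared.
		 */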
		if (total_scanned && priority < DEF_PRIORITY - 2)
			blk_congestion_wait(WRITE, HZ/10);

		/*
		 * We do this so kswapd doesn't build up large priorities for
		 * example when it is freeing in parallel with allocators.  It
		 * matches the direct reclaim path behaviour in terms of impact
		 * on zone->*_priority.
		 */
		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->prev_priority = zone->temp_priority;
	}
	if (!all_zones_ok) {
		cond_resched();
		goto loop_again;
	}

	return nr_reclaimed;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up.  This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned long order;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
	DEFINE_WAIT(wait);
	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	cpumask_t cpumask;

	cpumask = node_to_cpumask(pgdat->node_id);
	if (!cpus_empty(cpumask))
		set_cpus_allowed(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;

	order = 0;
	for ( ; ; ) {
		unsigned long new_order;

		try_to_freeze();

		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
		new_order = pgdat->kswapd_max_order;
		pgdat->kswapd_max_order = 0;
		if (order < new_order) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation
			 */
			order = new_order;
		} else {
			schedule();
			order = pgdat->kswapd_max_order;
		}
		finish_wait(&pgdat->kswapd_wait, &wait);

		balance_pgdat(pgdat, order);
	}
	return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order)
{
	pg_data_t *pgdat;

	if (!populated_zone(zone))
		return;

	pgdat = zone->zone_pgdat;
	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
		return;
	if (pgdat->kswapd_max_order < order)
		pgdat->kswapd_max_order = order;
	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
		return;
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_PM
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
 * number of reclaimed pages
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
				      int pass, struct scan_control *sc)
{
	struct zone *zone;
	unsigned long nr_to_scan, ret = 0;

	for_each_zone(zone) {

		if (!populated_zone(zone))
			continue;

		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
			continue;

		/* For pass = 0 we don't shrink the active list */
		if (pass > 0) {
			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
			if (zone->nr_scan_active >= nr_pages || pass > 3) {
				zone->nr_scan_active = 0;
				nr_to_scan = min(nr_pages, zone->nr_active);
				shrink_active_list(nr_to_scan, zone, sc);
			}
		}

		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
			zone->nr_scan_inactive = 0;
			nr_to_scan = min(nr_pages, zone->nr_inactive);
			ret += shrink_inactive_list(nr_to_scan, zone, sc);
			if (ret >= nr_pages)
				return ret;
		}
	}

	return ret;
}
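/*
 * Note on the prio/pass arguments above: prio follows the usual reclaim
 * priority convention (each list is charged list_size >> prio per call),
 * while pass reflects how aggressive shrink_all_memory() is being; once
 * pass > 3 the nr_scan thresholds are bypassed so even nearly-empty lists
 * get scanned.
 */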
/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_pages)
{
	unsigned long lru_pages, nr_slab;
	unsigned long ret = 0;
	int pass;
	struct reclaim_state reclaim_state;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 0,
		.swap_cluster_max = nr_pages,
		.may_writepage = 1,
		.swappiness = vm_swappiness,
	};

	current->reclaim_state = &reclaim_state;

	lru_pages = 0;
	for_each_zone(zone)
		lru_pages += zone->nr_active + zone->nr_inactive;

	nr_slab = global_page_state(NR_SLAB);
	/* If slab caches are huge, it's better to hit them first */
	while (nr_slab >= lru_pages) {
		reclaim_state.reclaimed_slab = 0;
		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
		if (!reclaim_state.reclaimed_slab)
			break;

		ret += reclaim_state.reclaimed_slab;
		if (ret >= nr_pages)
			goto out;

		nr_slab -= reclaim_state.reclaimed_slab;
	}

	/*
	 * We try to shrink LRUs in 5 passes:
	 * 0 = Reclaim from inactive_list only
	 * 1 = Reclaim from active list but don't reclaim mapped
	 * 2 = 2nd pass of type 1
	 * 3 = Reclaim mapped (normal reclaim)
	 * 4 = 2nd pass of type 3
	 */
	for (pass = 0; pass < 5; pass++) {
		int prio;

		/* Needed for shrinking slab caches later on */
		if (!lru_pages)
			for_each_zone(zone) {
				lru_pages += zone->nr_active;
				lru_pages += zone->nr_inactive;
			}

		/* Force reclaiming mapped pages in the passes #3 and #4 */
		if (pass > 2) {
			sc.may_swap = 1;
			sc.swappiness = 100;
		}

		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
			unsigned long nr_to_scan = nr_pages - ret;

			sc.nr_scanned = 0;
			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
			if (ret >= nr_pages)
				goto out;

			reclaim_state.reclaimed_slab = 0;
			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
			if (ret >= nr_pages)
				goto out;

			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
				blk_congestion_wait(WRITE, HZ / 10);
		}

		lru_pages = 0;
	}

	/*
	 * If ret = 0, we could not shrink LRUs, but there may be something
	 * in slab caches
	 */
	if (!ret)
		do {
			reclaim_state.reclaimed_slab = 0;
			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);

out:
	current->reclaim_state = NULL;

	return ret;
}
#endif

#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	pg_data_t *pgdat;
	cpumask_t mask;

	if (action == CPU_ONLINE) {
		for_each_online_pgdat(pgdat) {
			mask = node_to_cpumask(pgdat->node_id);
			if (any_online_cpu(mask) != NR_CPUS)
				/* One of our CPUs online: restore mask */
				set_cpus_allowed(pgdat->kswapd, mask);
		}
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk("Failed to start kswapd on node %d\n", nid);
		ret = -1;
	}
	return ret;
}

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_online_node(nid)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	int priority;
	unsigned long nr_reclaimed = 0;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.swap_cluster_max = max_t(unsigned long, nr_pages,
					SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
		.swappiness = vm_swappiness,
	};

	disable_swap_token();
	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	/*
	 * Free memory by calling shrink zone with increasing priorities
	 * until we have enough memory freed.
	 */
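	/*
	 * With ZONE_RECLAIM_PRIORITY == 4 the first pass of the loop below
	 * scans about list_size >> 4 (roughly 1/16th) of each LRU list, and
	 * every further pass doubles that window, down to priority 0.
	 */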
	priority = ZONE_RECLAIM_PRIORITY;
	do {
		nr_reclaimed += shrink_zone(priority, zone, &sc);
		priority--;
	} while (priority >= 0 && nr_reclaimed < nr_pages);

	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we just shake the slab
		 * a bit and then go off node for this particular allocation
		 * despite possibly having freed enough memory to allocate in
		 * this zone.  If we freed local memory then the next
		 * allocations will be local again.
		 *
		 * shrink_slab will free memory on all zones and may take
		 * a long time.
		 */
		shrink_slab(sc.nr_scanned, gfp_mask, order);
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	return nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	cpumask_t mask;
	int node_id;

	/*
	 * Zone reclaim reclaims unmapped file backed pages.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_page_state(zone, NR_FILE_PAGES) -
	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
		return 0;

	/*
	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
	 * not have reclaimable pages and if we should not delay the allocation
	 * then do not scan.
	 */
	if (!(gfp_mask & __GFP_WAIT) ||
	    zone->all_unreclaimable ||
	    atomic_read(&zone->reclaim_in_progress) > 0 ||
	    (current->flags & PF_MEMALLOC))
		return 0;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone->zone_pgdat->node_id;
	mask = node_to_cpumask(node_id);
	if (!cpus_empty(mask) && node_id != numa_node_id())
		return 0;
	return __zone_reclaim(zone, gfp_mask, order);
}
#endif