/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb.  This is because a hugetlb page never has PageLRU set
	 * (it is never placed on any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup).
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}
/*
 * Two special cases here: we could avoid taking compound_lock_irqsave
 * and could skip the tail refcounting (in _mapcount).
 *
 * 1. Hugetlbfs page:
 *
 *    PageHeadHuge will remain true until the compound page
 *    is released and enters the buddy allocator, and it could
 *    not be split by __split_huge_page_refcount().
 *
 *    So if we see PageHeadHuge set, and we have the tail page pin,
 *    then we could safely put head page.
 *
 * 2. Slab THP page:
 *
 *    PG_slab is cleared before the slab frees the head page, and
 *    tail pin cannot be the last reference left on the head page,
 *    because the slab code is free to reuse the compound page
 *    after a kfree/kmem_cache_free without having to check if
 *    there's any tail pin left.  In turn all tail pins must always be
 *    released while the head is still pinned by the slab code
 *    and so we know PG_slab will still be set too.
 *
 *    So if we see PageSlab set, and we have the tail page pin,
 *    then we could safely put head page.
 */
static __always_inline
void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
{
	/*
	 * If @page is a THP tail, we must read the tail page
	 * flags after the head page flags.  The
	 * __split_huge_page_refcount side enforces write memory barriers
	 * between clearing PageTail and before the head page
	 * can be freed and reallocated.
	 */
	smp_rmb();
	if (likely(PageTail(page))) {
		/*
		 * __split_huge_page_refcount cannot race
		 * here, see the comment above this function.
		 */
		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
		if (put_page_testzero(page_head)) {
			/*
			 * If this is the tail of a slab THP page,
			 * the tail pin must not be the last reference
			 * held on the page, because PG_slab cannot
			 * be cleared before all tail pins (which skip
			 * the _mapcount tail refcounting) have been
			 * released.
			 *
			 * If this is the tail of a hugetlbfs page,
			 * the tail pin may be the last reference on
			 * the page instead, because PageHeadHuge will
			 * not go away until the compound page enters
			 * the buddy allocator.
			 */
			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
			__put_compound_page(page_head);
		}
	} else
		/*
		 * __split_huge_page_refcount run before us,
		 * @page was a THP tail.  The split @page_head
		 * has been freed and reallocated as slab or
		 * hugetlbfs page of smaller order (only
		 * possible if reallocated as slab on x86).
		 */
		if (put_page_testzero(page))
			__put_single_page(page);
}
static __always_inline
void put_refcounted_compound_page(struct page *page_head, struct page *page)
{
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		unsigned long flags;

		/*
		 * @page_head wasn't a dangling pointer but it may not
		 * be a head page anymore by the time we obtain the
		 * lock.  That is ok as long as it can't be freed from
		 * under us.
		 */
		flags = compound_lock_irqsave(page_head);
		if (unlikely(!PageTail(page))) {
			/* __split_huge_page_refcount run before us */
			compound_unlock_irqrestore(page_head, flags);
			if (put_page_testzero(page_head)) {
				/*
				 * The @page_head may have been freed
				 * and reallocated as a compound page
				 * of smaller order and then freed
				 * again.  All we know is that it
				 * cannot have become: a THP page, a
				 * compound page of higher order, a
				 * tail page.  That is because we
				 * still hold the refcount of the
				 * split THP tail and page_head was
				 * the THP head before the split.
				 */
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
out_put_single:
			if (put_page_testzero(page))
				__put_single_page(page);
			return;
		}
		VM_BUG_ON_PAGE(page_head != page->first_page, page);
		/*
		 * We can release the refcount taken by
		 * get_page_unless_zero() now that
		 * __split_huge_page_refcount() is blocked on the
		 * compound_lock.
		 */
		if (put_page_testzero(page_head))
			VM_BUG_ON_PAGE(1, page_head);
		/* __split_huge_page_refcount will wait now */
		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
		atomic_dec(&page->_mapcount);
		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
		compound_unlock_irqrestore(page_head, flags);

		if (put_page_testzero(page_head)) {
			if (PageHead(page_head))
				__put_compound_page(page_head);
			else
				__put_single_page(page_head);
		}
	} else {
		/* @page_head is a dangling pointer */
		VM_BUG_ON_PAGE(PageTail(page), page);
		goto out_put_single;
	}
}

static void put_compound_page(struct page *page)
{
	struct page *page_head;

	/*
	 * We see the PageCompound set and PageTail not set, so @page may be:
	 * 1. hugetlbfs head page, or
	 * 2. THP head page.
	 */
	if (likely(!PageTail(page))) {
		if (put_page_testzero(page)) {
			/*
			 * By the time all refcounts have been released
			 * split_huge_page cannot run anymore from under us.
			 */
			if (PageHead(page))
				__put_compound_page(page);
			else
				__put_single_page(page);
		}
		return;
	}

	/*
	 * We see the PageCompound set and PageTail set, so @page may be:
	 * 1. a tail hugetlbfs page, or
	 * 2. a tail THP page, or
	 * 3. a split THP page.
	 *
	 * Case 3 is possible, as we may race with
	 * __split_huge_page_refcount tearing down a THP page.
	 */
	page_head = compound_head_by_tail(page);
	if (!__compound_tail_refcounted(page_head))
		put_unrefcounted_compound_page(page_head, page);
	else
		put_refcounted_compound_page(page_head, page);
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
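/*
 * Usage sketch: a caller that must keep a page alive across a window in
 * which it might otherwise be freed takes a reference first, then drops
 * it with put_page() when done.  The helper below is hypothetical and
 * only illustrates the get_page()/put_page() pairing.
 */
static void __maybe_unused example_put_page_pairing(struct page *page)
{
	get_page(page);		/* pin: the page cannot be freed from under us */
	/* ... safely read fields or copy data from the page here ... */
	put_page(page);		/* unpin: frees the page if this was the last ref */
}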
/*
 * This function is exported but must not be called by anything other
 * than get_page().  It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	unsigned long flags;
	bool got;
	struct page *page_head = compound_head(page);

	/* See the put_compound_page() comment. */
	if (!__compound_tail_refcounted(page_head)) {
		smp_rmb();
		if (likely(PageTail(page))) {
			/*
			 * This is a hugetlbfs page or a slab
			 * page.  __split_huge_page_refcount
			 * cannot race here.
			 */
			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
			__get_page_tail_foll(page, true);
			return true;
		} else {
			/*
			 * __split_huge_page_refcount run
			 * before us, "page" was a THP
			 * tail.  The split page_head has been
			 * freed and reallocated as slab or
			 * hugetlbfs page of smaller order
			 * (only possible if reallocated as
			 * slab on x86).
			 */
			return false;
		}
	}

	got = false;
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		/*
		 * page_head wasn't a dangling pointer but it
		 * may not be a head page anymore by the time
		 * we obtain the lock.  That is ok as long as it
		 * can't be freed from under us.
		 */
		flags = compound_lock_irqsave(page_head);
		/* here __split_huge_page_refcount won't run anymore */
		if (likely(PageTail(page))) {
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(page_head, flags);
		if (unlikely(!got))
			put_page(page_head);
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned.  This may be fewer than the number
 * requested.  If nr_segs is 0 or negative, returns 0.  Each page
 * returned must be released with a put_page() call when it is
 * finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointer to the page pinned.
 *		Must have space for one page pointer.
 *
 * Returns 1 on success.  The page returned must be released with a
 * put_page() call when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
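/*
 * Usage sketch: pinning the page backing a page-aligned lowmem kernel
 * buffer before handing it to I/O machinery.  "buf" and the helper name
 * are hypothetical; only the get_kernel_page()/put_page() flow is the
 * point here.
 */
static void __maybe_unused example_pin_kernel_buffer(void *buf)
{
	struct page *page;

	if (get_kernel_page((unsigned long)buf, 0, &page) == 1) {
		/* ... the page is pinned; safe to pass to I/O ... */
		put_page(page);		/* release the pin when done */
	}
}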
static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}
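/*
 * Sketch of the move_fn contract used by pagevec_lru_move_fn(): the
 * callback runs under the owning zone's lru_lock for each page in the
 * pagevec and may reposition the page within @lruvec.
 * pagevec_move_tail_fn() above is the canonical example; the
 * hypothetical callback below merely counts the pages that are on an
 * LRU list, to show the shape of such a callback.
 */
static void __maybe_unused example_count_lru_fn(struct page *page,
						struct lruvec *lruvec,
						void *arg)
{
	int *nr_on_lru = arg;

	if (PageLRU(page))
		(*nr_on_lru)++;	/* lru_lock held: PageLRU is stable here */
}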
/*
 * Writeback is about to end against a page which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec.  Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained.  Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list, causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs.  Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec.  The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained.  This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
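/*
 * Usage sketch of the deferred-placement window described above: a new
 * page is queued with lru_cache_add(), and touching it before the
 * pagevec drains walks it through the documented state machine
 * (unreferenced -> referenced -> active).  The helper name is
 * hypothetical.
 */
static void __maybe_unused example_add_then_access(struct page *page)
{
	lru_cache_add(page);		/* queued; LRU placement deferred */
	mark_page_accessed(page);	/* first touch: sets PageReferenced */
	mark_page_accessed(page);	/* second touch: sets PageActive via
					 * __lru_cache_activate_page(), so the
					 * drain adds it to the active list */
}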
/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (spinlock), which implies preemption disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}
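/*
 * Usage sketch: anonymous fault paths wire a freshly allocated page into
 * the rmap and page tables, then place it on the LRU in one step; the
 * VMA's flags decide between the active list and the unevictable list.
 * The surrounding fault plumbing is elided and the helper name is
 * hypothetical.
 */
static void __maybe_unused example_fault_path(struct page *page,
					      struct vm_area_struct *vma)
{
	/* ... page_add_new_anon_rmap(), set_pte_at(), etc. ... */
	lru_cache_add_active_or_unevictable(page, vma);
}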
/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped but is dirty or under writeback, it
 * can be reclaimed ASAP by using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page moves to the inactive list's head because the VM
 * expects the flusher threads to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
				   void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * Setting PG_reclaim can race with end_page_writeback().
		 * That can make readahead confusing, but the race window
		 * is _really_ small and it's a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec;
		 * move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints to the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages, such as one using
	 * mprotect, deactivating unevictable pages to accelerate reclaim
	 * is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
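/*
 * Usage sketch: code that must observe every page on the LRU lists
 * (e.g. before isolating pages for migration or offlining) first
 * flushes the per-cpu pagevecs on all CPUs, so no page is hiding in a
 * deferred queue.  The helper name is hypothetical.
 */
static void __maybe_unused example_flush_then_scan(void)
{
	lru_add_drain_all();	/* may sleep: flushes every CPU's pagevecs */
	/* ... now isolate_lru_page() etc. will see the queued pages ... */
}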
/**
 * release_pages - batched page_cache_release()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same zone.  The lock is held only if zone != NULL.
		 */
		if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = NULL;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
							       flags);
				lock_batch = 0;
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * calls release_pages() directly to avoid mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
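/*
 * Usage sketch: batching reference drops.  A caller collects page
 * pointers in a pagevec and releases them all at once instead of
 * calling page_cache_release() per page.  The helper name is
 * hypothetical; pagevec_release() in pagevec.h is the usual wrapper
 * around __pagevec_release().
 */
static void __maybe_unused example_batched_release(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);			/* 0: pages are not cache-cold */
	for (i = 0; i < nr; i++)
		if (!pagevec_add(&pvec, pages[i]))	/* returns space left */
			__pagevec_release(&pvec);	/* full: flush the batch */
	__pagevec_release(&pvec);			/* flush the remainder */
}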
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:	Where the resulting entries are placed
 * @mapping:	The address_space to search
 * @start:	The starting entry index
 * @nr_entries:	The maximum number of entries
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_entries,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_entries,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:	The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
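/*
 * Usage sketch, modelled on the truncate/invalidate loops: look up a
 * batch of entries, drop the shadow entries, then operate on the real
 * pages that remain.  The helper name and the empty loop body are
 * hypothetical.
 */
static void __maybe_unused example_scan_mapping(struct address_space *mapping)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_entries(&pvec, mapping, index,
				      PAGEVEC_SIZE, indices)) {
		/* advance past the last entry seen, shadow or real */
		index = indices[pagevec_count(&pvec) - 1] + 1;
		pagevec_remove_exceptionals(&pvec);	/* keep real pages only */
		for (i = 0; i < pagevec_count(&pvec); i++) {
			/* ... inspect pvec.pages[i] ... */
		}
		pagevec_release(&pvec);		/* drop the gang references */
	}
}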
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
			pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
			    pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
				      nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++)
		spin_lock_init(&swapper_spaces[i].tree_lock);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
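/*
 * Sketch: how page_cluster is consumed.  Swap readahead brings in up to
 * 1 << page_cluster pages around a fault, so the values chosen above
 * give a window of 4 pages on small-memory machines and 8 otherwise.
 * The tunable is exposed as /proc/sys/vm/page-cluster.  The helper name
 * is hypothetical.
 */
static unsigned long __maybe_unused example_swap_readahead_window(void)
{
	return 1UL << page_cluster;	/* pages per swap readahead cluster */
}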