/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for THP, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never put on any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup).
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

/**
 * Two special cases here: we could avoid taking compound_lock_irqsave
 * and could skip the tail refcounting (in _mapcount).
 *
 * 1. Hugetlbfs page:
 *
 *    PageHeadHuge will remain true until the compound page
 *    is released and enters the buddy allocator, and it could
 *    not be split by __split_huge_page_refcount().
 *
 *    So if we see PageHeadHuge set, and we have the tail page pin,
 *    then we could safely put head page.
 *
 * 2. Slab THP page:
 *
 *    PG_slab is cleared before the slab frees the head page, and
 *    tail pin cannot be the last reference left on the head page,
 *    because the slab code is free to reuse the compound page
 *    after a kfree/kmem_cache_free without having to check if
 *    there's any tail pin left.  In turn all tail pins must be
 *    released while the head is still pinned by the slab code,
 *    and so we know PG_slab will still be set too.
 *
 *    So if we see PageSlab set, and we have the tail page pin,
 *    then we could safely put head page.
 */
static __always_inline
void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
{
	/*
	 * If @page is a THP tail, we must read the tail page
	 * flags after the head page flags. The
	 * __split_huge_page_refcount side enforces write memory barriers
	 * between clearing PageTail and before the head page
	 * can be freed and reallocated.
	 */
	smp_rmb();
	if (likely(PageTail(page))) {
		/*
		 * __split_huge_page_refcount cannot race
		 * here, see the comment above this function.
		 */
		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
		if (put_page_testzero(page_head)) {
			/*
			 * If this is the tail of a slab THP page,
			 * the tail pin must not be the last reference
			 * held on the page, because PG_slab cannot
			 * be cleared before all tail pins (which skip
			 * the _mapcount tail refcounting) have been
			 * released.
			 *
			 * If this is the tail of a hugetlbfs page,
			 * the tail pin may be the last reference on
			 * the page instead, because PageHeadHuge will
			 * not go away until the compound page enters
			 * the buddy allocator.
			 */
			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
			__put_compound_page(page_head);
		}
	} else
		/*
		 * __split_huge_page_refcount ran before us,
		 * @page was a THP tail. The split @page_head
		 * has been freed and reallocated as slab or
		 * hugetlbfs page of smaller order (only
		 * possible if reallocated as slab on x86).
		 */
		if (put_page_testzero(page))
			__put_single_page(page);
}

static __always_inline
void put_refcounted_compound_page(struct page *page_head, struct page *page)
{
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		unsigned long flags;

		/*
		 * @page_head wasn't a dangling pointer but it may not
		 * be a head page anymore by the time we obtain the
		 * lock. That is ok as long as it can't be freed from
		 * under us.
		 */
		flags = compound_lock_irqsave(page_head);
		if (unlikely(!PageTail(page))) {
			/* __split_huge_page_refcount ran before us */
			compound_unlock_irqrestore(page_head, flags);
			if (put_page_testzero(page_head)) {
				/*
				 * The @page_head may have been freed
				 * and reallocated as a compound page
				 * of smaller order and then freed
				 * again.  All we know is that it
				 * cannot have become: a THP page, a
				 * compound page of higher order, a
				 * tail page.  That is because we
				 * still hold the refcount of the
				 * split THP tail and page_head was
				 * the THP head before the split.
				 */
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
out_put_single:
			if (put_page_testzero(page))
				__put_single_page(page);
			return;
		}
		VM_BUG_ON_PAGE(page_head != page->first_page, page);
		/*
		 * We can release the refcount taken by
		 * get_page_unless_zero() now that
		 * __split_huge_page_refcount() is blocked on the
		 * compound_lock.
		 */
		if (put_page_testzero(page_head))
			VM_BUG_ON_PAGE(1, page_head);
		/* __split_huge_page_refcount will wait now */
		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
		atomic_dec(&page->_mapcount);
		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
		compound_unlock_irqrestore(page_head, flags);

		if (put_page_testzero(page_head)) {
			if (PageHead(page_head))
				__put_compound_page(page_head);
			else
				__put_single_page(page_head);
		}
	} else {
		/* @page_head is a dangling pointer */
		VM_BUG_ON_PAGE(PageTail(page), page);
		goto out_put_single;
	}
}

static void put_compound_page(struct page *page)
{
	struct page *page_head;

	/*
	 * We see PageCompound set and PageTail not set, so @page may be:
	 * 1. hugetlbfs head page, or
	 * 2. THP head page.
	 */
	if (likely(!PageTail(page))) {
		if (put_page_testzero(page)) {
			/*
			 * By the time all refcounts have been released
			 * split_huge_page cannot run anymore from under us.
			 */
			if (PageHead(page))
				__put_compound_page(page);
			else
				__put_single_page(page);
		}
		return;
	}

	/*
	 * We see PageCompound set and PageTail set, so @page may be:
	 * 1. a tail hugetlbfs page, or
	 * 2. a tail THP page, or
	 * 3. a split THP page.
	 *
	 * Case 3 is possible, as we may race with
	 * __split_huge_page_refcount tearing down a THP page.
	 */
	page_head = compound_head_by_tail(page);
	if (!__compound_tail_refcounted(page_head))
		put_unrefcounted_compound_page(page_head, page);
	else
		put_refcounted_compound_page(page_head, page);
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
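/*
 * Example (an illustrative sketch, not part of the original file): the
 * common path into put_compound_page() is a get_user_pages*() pin that is
 * later dropped with a plain put_page().  "addr" is a hypothetical user
 * address; put_page() transparently takes the compound slow path above when
 * the pinned page happens to be a THP or hugetlbfs tail page.
 *
 *	struct page *pages[1];
 *
 *	if (get_user_pages_fast(addr, 1, 0, pages) == 1) {
 *		... access the pinned page ...
 *		put_page(pages[0]);
 *	}
 */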

/*
 * This function is exported but must not be called by anything other
 * than get_page(). It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	unsigned long flags;
	bool got;
	struct page *page_head = compound_head(page);

	/* See the comments in put_compound_page(). */
	if (!__compound_tail_refcounted(page_head)) {
		smp_rmb();
		if (likely(PageTail(page))) {
			/*
			 * This is a hugetlbfs page or a slab
			 * page. __split_huge_page_refcount
			 * cannot race here.
			 */
			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
			__get_page_tail_foll(page, true);
			return true;
		} else {
			/*
			 * __split_huge_page_refcount ran
			 * before us, "page" was a THP
			 * tail. The split page_head has been
			 * freed and reallocated as slab or
			 * hugetlbfs page of smaller order
			 * (only possible if reallocated as
			 * slab on x86).
			 */
			return false;
		}
	}

	got = false;
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		/*
		 * page_head wasn't a dangling pointer but it
		 * may not be a head page anymore by the time
		 * we obtain the lock. That is ok as long as it
		 * can't be freed from under us.
		 */
		flags = compound_lock_irqsave(page_head);
		/* here __split_huge_page_refcount won't run anymore */
		if (likely(PageTail(page))) {
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(page_head, flags);
		if (unlikely(!got))
			put_page(page_head);
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);
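/*
 * Example (an illustrative sketch, not part of the original file): a caller
 * pins a page-sized, page-aligned kernel buffer "buf" (hypothetical) so it
 * can be handed to code that works on struct page.  Each kvec segment must
 * be exactly PAGE_SIZE, and every page returned must eventually be dropped
 * with put_page().
 *
 *	struct page *pages[1];
 *	struct kvec kiov = {
 *		.iov_base = buf,
 *		.iov_len  = PAGE_SIZE,
 *	};
 *
 *	if (get_kernel_pages(&kiov, 1, 0, pages) == 1) {
 *		... use pages[0] ...
 *		put_page(pages[0]);
 *	}
 */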

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointer to the page pinned.
 *		Must have room for one page pointer.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQs disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}
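/*
 * Example (an illustrative sketch, not part of the original file): the
 * pattern used throughout this file for the per-cpu pagevecs.  The names
 * "some_pvec" and "some_move_fn" are placeholders.  pagevec_add() stores
 * the page and returns the space left, so a zero return means the pagevec
 * just became full and should be drained now.
 *
 *	struct pagevec *pvec = &get_cpu_var(some_pvec);
 *
 *	page_cache_get(page);
 *	if (!pagevec_add(pvec, page))
 *		pagevec_lru_move_fn(pvec, some_move_fn, NULL);
 *	put_cpu_var(some_pvec);
 */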

/*
 * Writeback is about to end against a page which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it to the
 * tail of the inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list, causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
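/*
 * Example (an illustrative sketch, not part of the original file): because
 * LRU placement is deferred, a caller that owns a new, not-yet-visible page
 * can steer it straight onto the active list, provided everything happens on
 * the same CPU before the pagevec drains:
 *
 *	__SetPageReferenced(page);	new page, not yet visible to others
 *	lru_cache_add(page);		queued on this CPU's lru_add_pvec
 *	mark_page_accessed(page);	referenced -> active, sets PG_active
 *					via __lru_cache_activate_page()
 *	lru_add_drain();		the drain files it on the active LRU
 *
 * The explicit lru_add_drain() is optional; it only makes the placement
 * happen immediately instead of at the next natural drain point.
 */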

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (spinlock), which implies preemption disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}

/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped and is dirty/writeback, it can be
 * reclaimed ASAP by setting PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because the
 * VM expects the flusher threads to write it out, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * PG_reclaim can race with end_page_writeback, which
		 * can make readahead confusing.  But the race window
		 * is _really_ small and it's a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages such as mprotect,
	 * deactivating unevictable pages to accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
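/*
 * Example (an illustrative sketch, not part of the original file): callers
 * that need pages to actually be on the LRU lists, rather than parked on
 * some CPU's pagevec, flush the pagevecs first.  Assuming nothing else has
 * isolated or freed the page in the meantime:
 *
 *	lru_cache_add(page);
 *	...
 *	lru_add_drain_all();		wait for every CPU to drain
 *	WARN_ON(!PageLRU(page));	the page is now really on an LRU list
 */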

/**
 * release_pages - batched page_cache_release()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same zone. The lock is held only if zone != NULL.
		 */
		if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = NULL;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				lock_batch = 0;
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
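/*
 * Example (an illustrative sketch, not part of the original file): for a
 * batch of pages that were pinned earlier, release_pages() is the batched
 * equivalent of calling put_page() on each entry, but it amortizes the
 * zone->lru_lock acquisitions:
 *
 *	int i;
 *
 *	for (i = 0; i < nr; i++)
 *		put_page(pages[i]);
 *
 * becomes simply
 *
 *	release_pages(pages, nr, false);
 */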

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:	Where the resulting entries are placed
 * @mapping:	The address_space to search
 * @start:	The starting entry index
 * @nr_pages:	The maximum number of entries
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_pages pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_pages,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_pages,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:	The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
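/*
 * Example (an illustrative sketch, not part of the original file): roughly
 * the shape of the truncate/invalidate callers of these helpers.  "process"
 * stands in for whatever the caller does with each page:
 *
 *	struct pagevec pvec;
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		int i;
 *
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			index = indices[i];
 *			if (radix_tree_exceptional_entry(pvec.pages[i]))
 *				continue;	shadow entry, no page here
 *			... process pvec.pages[i] ...
 *		}
 *		pagevec_remove_exceptionals(&pvec);
 *		pagevec_release(&pvec);
 *		cond_resched();
 *		index++;
 *	}
 */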

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system.
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++)
		spin_lock_init(&swapper_spaces[i].tree_lock);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
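/*
 * Note (an illustrative aside, not part of the original file): page_cluster
 * is exported to userspace as the vm.page-cluster sysctl (see
 * Documentation/sysctl/vm.txt).  Swap readahead reads around a faulting
 * entry in groups of at most:
 *
 *	unsigned long readahead_pages = 1UL << page_cluster;
 *
 * so the defaults chosen above correspond to 4-page and 8-page clusters.
 */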