// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;
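/*
 * Per-VMA readahead state is packed into a single word
 * (vma->swap_readahead_info) by the macros below: the low
 * SWAP_RA_WIN_SHIFT bits count recent readahead hits, the remaining
 * sub-page bits hold the current window size, and the page-aligned
 * high bits store the address of the last faulting access.
 */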
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period, so it is impossible for them to
		 * belong to different usages.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	xa_lock_irq(&address_space->i_pages);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->i_pages,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only a context which has set the SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache(), so add_to_swap_cache()
		 * doesn't return -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->i_pages, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	xa_unlock_irq(&address_space->i_pages);

	return error;
}

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}
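/*
 * Note the ordering above: radix_tree_maybe_preload_order() runs while
 * we may still sleep, before __add_to_swap_cache() takes the
 * irq-disabled i_pages lock; under that lock the insert can only draw
 * on preallocated nodes (the tree's own gfp mask is GFP_ATOMIC).
 */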
/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->i_pages, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	if (mem_cgroup_try_charge_swap(page, entry))
		goto fail;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can
		 * safely clear the SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should
	 * be dirty. A special case is an MADV_FREE page. The page's pte
	 * could have the dirty bit cleared while the page's SwapBacked bit
	 * is still set, because clearing the dirty bit and the SwapBacked
	 * bit is not protected by any lock. For such a page, unmap will not
	 * set the dirty bit, so page reclaim will not write the page out.
	 * This can cause data corruption when the page is swapped in later.
	 * Always setting the dirty bit for the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}
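/*
 * Reference accounting is symmetric: __add_to_swap_cache() takes one
 * page reference per subpage on behalf of the swap cache, and
 * delete_from_swap_cache() drops those references again once the
 * entries are gone from the radix tree.
 */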
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's OK to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);
		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}
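/*
 * __read_swap_cache_async() looks up @entry in the swap cache.  On a
 * miss it allocates a fresh page, claims the swap slot with
 * SWAP_HAS_CACHE and inserts the locked page into the swap cache;
 * *@new_page_allocated then tells the caller that the returned page is
 * locked and that the caller must start the read itself.
 */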
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip readahead for an unused swap slot.
		 * During swapoff, when swap_slot_cache is disabled,
		 * we have to handle the race between putting a
		 * swap entry into the swap cache and marking the swap slot
		 * as SWAP_HAS_CACHE.  That is handled later in this function,
		 * otherwise swapoff will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Call radix_tree_maybe_preload() while we can still wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear the SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}
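/*
 * Worked example for __swapin_nr_pages() below: with 5 recent readahead
 * hits the window starts at 5 + 2 = 7 pages and is rounded up to 8,
 * subject to the 1 << page_cluster cap; with no hits and a fault that
 * is not adjacent to the previous offset, it collapses to a single page.
 */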
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}
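/*
 * Example for the cluster readahead below: with a window of 8 pages and
 * a fault at swap offset 21, mask = 7 and the loop reads offsets 16..23
 * (21 & ~7 through 21 | 7), skipping offset 0 (the swap header) and
 * clamping the end to si->max.
 */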
/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}
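/*
 * swap_ra_info() below places the readahead window according to the
 * detected access pattern: growing forward from the fault if the
 * previous fault hit the page just before it (fpfn == pfn + 1), growing
 * backward if it hit the page just after (pfn == fpfn + 1), and centred
 * on the fault otherwise.
 */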
static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
	entry = pte_to_swp_entry(*pte);
	if (unlikely(non_swap_entry(entry))) {
		pte_unmap(orig_pte);
		return;
	}

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {0, };

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It is the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead blocks with either cluster-based
 * (i.e. physical disk based) or VMA-based (i.e. virtual address based
 * on the faulting address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}
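/*
 * The sysfs knob below switches the readahead policy at runtime, e.g.:
 *
 *	echo false > /sys/kernel/mm/swap/vma_ra_enabled
 *
 * falls back to cluster-based readahead for subsequent faults.
 */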
"true" : "false"); 807 } 808 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 809 struct kobj_attribute *attr, 810 const char *buf, size_t count) 811 { 812 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) 813 enable_vma_readahead = true; 814 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) 815 enable_vma_readahead = false; 816 else 817 return -EINVAL; 818 819 return count; 820 } 821 static struct kobj_attribute vma_ra_enabled_attr = 822 __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, 823 vma_ra_enabled_store); 824 825 static struct attribute *swap_attrs[] = { 826 &vma_ra_enabled_attr.attr, 827 NULL, 828 }; 829 830 static struct attribute_group swap_attr_group = { 831 .attrs = swap_attrs, 832 }; 833 834 static int __init swap_init_sysfs(void) 835 { 836 int err; 837 struct kobject *swap_kobj; 838 839 swap_kobj = kobject_create_and_add("swap", mm_kobj); 840 if (!swap_kobj) { 841 pr_err("failed to create swap kobject\n"); 842 return -ENOMEM; 843 } 844 err = sysfs_create_group(swap_kobj, &swap_attr_group); 845 if (err) { 846 pr_err("failed to register swap group\n"); 847 goto delete_obj; 848 } 849 return 0; 850 851 delete_obj: 852 kobject_put(swap_kobj); 853 return err; 854 } 855 subsys_initcall(swap_init_sysfs); 856 #endif 857