/*
 * linux/mm/swap_state.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 *
 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
        .writepage = swap_writepage,
        .set_page_dirty = swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
        .migratepage = migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
bool swap_vma_readahead = true;

#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)                            \
        (((addr) & PAGE_MASK) |                                 \
         (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |    \
         ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)                                    \
        (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)

static struct {
        unsigned long add_total;
        unsigned long del_total;
        unsigned long find_success;
        unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
        unsigned int i, j, nr;
        unsigned long ret = 0;
        struct address_space *spaces;

        rcu_read_lock();
        for (i = 0; i < MAX_SWAPFILES; i++) {
                /*
                 * The corresponding entries in nr_swapper_spaces and
                 * swapper_spaces will be reused only after at least
                 * one grace period, so they can never belong to
                 * different usages.
                 */
                nr = nr_swapper_spaces[i];
                spaces = rcu_dereference(swapper_spaces[i]);
                if (!nr || !spaces)
                        continue;
                for (j = 0; j < nr; j++)
                        ret += spaces[j].nrpages;
        }
        rcu_read_unlock();
        return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
        printk("%lu pages in swap cache\n", total_swapcache_pages());
        printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total);
        printk("Free swap  = %ldkB\n",
                get_nr_swap_pages() << (PAGE_SHIFT - 10));
        printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
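 *
 * For a transparent huge page, all of its hpage_nr_pages() subpages are
 * inserted into nr consecutive radix tree slots under a single tree_lock
 * critical section, and each subpage's ->private is set to the matching
 * swap entry value.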
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
        int error, i, nr = hpage_nr_pages(page);
        struct address_space *address_space;
        pgoff_t idx = swp_offset(entry);

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapCache(page), page);
        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

        page_ref_add(page, nr);
        SetPageSwapCache(page);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        for (i = 0; i < nr; i++) {
                set_page_private(page + i, entry.val + i);
                error = radix_tree_insert(&address_space->page_tree,
                                          idx + i, page + i);
                if (unlikely(error))
                        break;
        }
        if (likely(!error)) {
                address_space->nrpages += nr;
                __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
                ADD_CACHE_INFO(add_total, nr);
        } else {
                /*
                 * Only the context which has set the SWAP_HAS_CACHE flag
                 * would call add_to_swap_cache(), so add_to_swap_cache()
                 * doesn't return -EEXIST.
                 */
                VM_BUG_ON(error == -EEXIST);
                set_page_private(page + i, 0UL);
                while (i--) {
                        radix_tree_delete(&address_space->page_tree, idx + i);
                        set_page_private(page + i, 0UL);
                }
                ClearPageSwapCache(page);
                page_ref_sub(page, nr);
        }
        spin_unlock_irq(&address_space->tree_lock);

        return error;
}

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
        int error;

        error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
        if (!error) {
                error = __add_to_swap_cache(page, entry);
                radix_tree_preload_end();
        }
        return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
        struct address_space *address_space;
        int i, nr = hpage_nr_pages(page);
        swp_entry_t entry;
        pgoff_t idx;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageSwapCache(page), page);
        VM_BUG_ON_PAGE(PageWriteback(page), page);

        entry.val = page_private(page);
        address_space = swap_address_space(entry);
        idx = swp_offset(entry);
        for (i = 0; i < nr; i++) {
                radix_tree_delete(&address_space->page_tree, idx + i);
                set_page_private(page + i, 0);
        }
        ClearPageSwapCache(page);
        address_space->nrpages -= nr;
        __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
        ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache. Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page)
{
        swp_entry_t entry;
        int err;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageUptodate(page), page);

        entry = get_swap_page(page);
        if (!entry.val)
                return 0;

        if (mem_cgroup_try_charge_swap(page, entry))
                goto fail;

        /*
         * Radix-tree node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
         * stops emergency reserves from being allocated.
         *
         * TODO: this could cause a theoretical memory reclaim
         * deadlock in the swap out path.
         */
        /*
         * Add it to the swap cache.
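         * The GFP flags passed below (__GFP_HIGH | __GFP_NOMEMALLOC |
         * __GFP_NOWARN) apply to the radix tree node preload: stay clear of
         * the emergency reserves (see the comment above) and don't warn if
         * the allocation fails.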
         */
        err = add_to_swap_cache(page, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
        /* -ENOMEM radix-tree allocation failure */
        if (err)
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                goto fail;
        /*
         * Normally the page will be dirtied in unmap because its pte should be
         * dirty. A special case is an MADV_FREE page. The page's pte could
         * have the dirty bit cleared while the page's SwapBacked bit is still
         * set, because clearing the dirty bit and the SwapBacked bit is not
         * protected by a lock. For such a page, unmap will not set the dirty
         * bit, so page reclaim will not write the page out. This can cause
         * data corruption when the page is swapped in later. Always setting
         * the dirty bit for the page solves the problem.
         */
        set_page_dirty(page);

        return 1;

fail:
        put_swap_page(page, entry);
        return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;
        struct address_space *address_space;

        entry.val = page_private(page);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        __delete_from_swap_cache(page);
        spin_unlock_irq(&address_space->tree_lock);

        put_swap_page(page, entry);
        page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *                                      - Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
        if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
                try_to_free_swap(page);
                unlock_page(page);
        }
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
        free_swap_cache(page);
        if (!is_huge_zero_page(page))
                put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them. They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
        struct page **pagep = pages;
        int i;

        lru_add_drain();
        for (i = 0; i < nr; i++)
                free_swap_cache(pagep[i]);
        release_pages(pagep, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
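 *
 * A hit on a page that still has PageReadahead set also feeds back into the
 * readahead statistics: the per-VMA window state when a vma is supplied, or
 * the global swapin_readahead_hits counter otherwise.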
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
                               unsigned long addr)
{
        struct page *page;
        unsigned long ra_info;
        int win, hits, readahead;

        page = find_get_page(swap_address_space(entry), swp_offset(entry));

        INC_CACHE_INFO(find_total);
        if (page) {
                INC_CACHE_INFO(find_success);
                if (unlikely(PageTransCompound(page)))
                        return page;
                readahead = TestClearPageReadahead(page);
                if (vma) {
                        ra_info = GET_SWAP_RA_VAL(vma);
                        win = SWAP_RA_WIN(ra_info);
                        hits = SWAP_RA_HITS(ra_info);
                        if (readahead)
                                hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
                        atomic_long_set(&vma->swap_readahead_info,
                                        SWAP_RA_VAL(addr, win, hits));
                }
                if (readahead) {
                        count_vm_event(SWAP_RA_HIT);
                        if (!vma)
                                atomic_inc(&swapin_readahead_hits);
                }
        }
        return page;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr,
                        bool *new_page_allocated)
{
        struct page *found_page, *new_page = NULL;
        struct address_space *swapper_space = swap_address_space(entry);
        int err;
        *new_page_allocated = false;

        do {
                /*
                 * First check the swap cache. Since this is normally
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
                found_page = find_get_page(swapper_space, swp_offset(entry));
                if (found_page)
                        break;

                /*
                 * Just skip readahead for an unused swap slot.
                 * During swap_off, when swap_slot_cache is disabled,
                 * we have to handle the race between putting
                 * the swap entry in the swap cache and marking the swap
                 * slot as SWAP_HAS_CACHE. That's done in a later part of
                 * the code, or else swap_off will be aborted if we return
                 * NULL here.
                 */
                if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
                        break;

                /*
                 * Get a new page to read into from swap.
                 */
                if (!new_page) {
                        new_page = alloc_page_vma(gfp_mask, vma, addr);
                        if (!new_page)
                                break;          /* Out of memory */
                }

                /*
                 * call radix_tree_preload() while we can wait.
                 */
                err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
                if (err)
                        break;

                /*
                 * Swap entry may have been freed since our caller observed it.
                 */
                err = swapcache_prepare(entry);
                if (err == -EEXIST) {
                        radix_tree_preload_end();
                        /*
                         * We might race against get_swap_page() and stumble
                         * across a SWAP_HAS_CACHE swap_map entry whose page
                         * has not been brought into the swapcache yet.
                         */
                        cond_resched();
                        continue;
                }
                if (err) {              /* swp entry is obsolete ? */
                        radix_tree_preload_end();
                        break;
                }

                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
                __SetPageLocked(new_page);
                __SetPageSwapBacked(new_page);
                err = __add_to_swap_cache(new_page, entry);
                if (likely(!err)) {
                        radix_tree_preload_end();
                        /*
                         * Initiate read into locked page and return.
                         */
                        lru_cache_add_anon(new_page);
                        *new_page_allocated = true;
                        return new_page;
                }
                radix_tree_preload_end();
                __ClearPageLocked(new_page);
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
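                 * put_swap_page() below is what actually drops the
                 * SWAP_HAS_CACHE reference taken by swapcache_prepare()
                 * above.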
                 */
                put_swap_page(new_page, entry);
        } while (err != -ENOMEM);

        if (new_page)
                put_page(new_page);
        return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
        bool page_was_allocated;
        struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
                        vma, addr, &page_was_allocated);

        if (page_was_allocated)
                swap_readpage(retpage, do_poll);

        return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
                                      unsigned long offset,
                                      int hits,
                                      int max_pages,
                                      int prev_win)
{
        unsigned int pages, last_ra;

        /*
         * This heuristic has been found to work well on both sequential and
         * random loads, swapping to hard disk or to SSD: please don't ask
         * what the "+ 2" means, it just happens to work well, that's all.
         */
        pages = hits + 2;
        if (pages == 2) {
                /*
                 * We can have no readahead hits to judge by: but must not get
                 * stuck here forever, so check for an adjacent offset instead
                 * (and don't even bother to check whether swap type is same).
                 */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
        } else {
                unsigned int roundup = 4;
                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }

        if (pages > max_pages)
                pages = max_pages;

        /* Don't shrink readahead too fast */
        last_ra = prev_win / 2;
        if (pages < last_ra)
                pages = last_ra;

        return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
        static unsigned long prev_offset;
        unsigned int hits, pages, max_pages;
        static atomic_t last_readahead_pages;

        max_pages = 1 << READ_ONCE(page_cluster);
        if (max_pages <= 1)
                return 1;

        hits = atomic_xchg(&swapin_readahead_hits, 0);
        pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
                                  atomic_read(&last_readahead_pages));
        if (!hits)
                prev_offset = offset;
        atomic_set(&last_readahead_pages, pages);

        return pages;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
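 *
 * The readahead window size comes from swapin_nr_pages(); all reads are
 * issued under a single block plug so the block layer can merge them.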
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        unsigned long entry_offset = swp_offset(entry);
        unsigned long offset = entry_offset;
        unsigned long start_offset, end_offset;
        unsigned long mask;
        struct blk_plug plug;
        bool do_poll = true, page_allocated;

        mask = swapin_nr_pages(offset) - 1;
        if (!mask)
                goto skip;

        do_poll = false;
        /* Read a page_cluster sized and aligned cluster around offset. */
        start_offset = offset & ~mask;
        end_offset = offset | mask;
        if (!start_offset)      /* First page is swap header. */
                start_offset++;

        blk_start_plug(&plug);
        for (offset = start_offset; offset <= end_offset; offset++) {
                /* Ok, do the async read-ahead now */
                page = __read_swap_cache_async(
                        swp_entry(swp_type(entry), offset),
                        gfp_mask, vma, addr, &page_allocated);
                if (!page)
                        continue;
                if (page_allocated) {
                        swap_readpage(page, false);
                        if (offset != entry_offset &&
                            likely(!PageTransCompound(page))) {
                                SetPageReadahead(page);
                                count_vm_event(SWAP_RA);
                        }
                }
                put_page(page);
        }
        blk_finish_plug(&plug);

        lru_add_drain();        /* Push any new pages onto the LRU now */
skip:
        return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
        struct address_space *spaces, *space;
        unsigned int i, nr;

        nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
        spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
        if (!spaces)
                return -ENOMEM;
        for (i = 0; i < nr; i++) {
                space = spaces + i;
                INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
                atomic_set(&space->i_mmap_writable, 0);
                space->a_ops = &swap_aops;
                /* swap cache doesn't use writeback related tags */
                mapping_set_no_writeback_tags(space);
                spin_lock_init(&space->tree_lock);
        }
        nr_swapper_spaces[type] = nr;
        rcu_assign_pointer(swapper_spaces[type], spaces);

        return 0;
}

void exit_swap_address_space(unsigned int type)
{
        struct address_space *spaces;

        spaces = swapper_spaces[type];
        nr_swapper_spaces[type] = 0;
        rcu_assign_pointer(swapper_spaces[type], NULL);
        synchronize_rcu();
        kvfree(spaces);
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
                                     unsigned long faddr,
                                     unsigned long lpfn,
                                     unsigned long rpfn,
                                     unsigned long *start,
                                     unsigned long *end)
{
        *start = max3(lpfn, PFN_DOWN(vma->vm_start),
                      PFN_DOWN(faddr & PMD_MASK));
        *end = min3(rpfn, PFN_DOWN(vma->vm_end),
                    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

struct page *swap_readahead_detect(struct vm_fault *vmf,
                                   struct vma_swap_readahead *swap_ra)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long swap_ra_info;
        struct page *page;
        swp_entry_t entry;
        unsigned long faddr, pfn, fpfn;
        unsigned long start, end;
        pte_t *pte;
        unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
        pte_t *tpte;
#endif

        max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
                             SWAP_RA_ORDER_CEILING);
        if (max_win == 1) {
                swap_ra->win = 1;
                return NULL;
        }

        faddr = vmf->address;
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry)))
                return NULL;
        page = lookup_swap_cache(entry, vma, faddr);
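        /* Swap cache hit: no readahead window needs to be computed. */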
        if (page)
                return page;

        fpfn = PFN_DOWN(faddr);
        swap_ra_info = GET_SWAP_RA_VAL(vma);
        pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
        prev_win = SWAP_RA_WIN(swap_ra_info);
        hits = SWAP_RA_HITS(swap_ra_info);
        swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
                                               max_win, prev_win);
        atomic_long_set(&vma->swap_readahead_info,
                        SWAP_RA_VAL(faddr, win, 0));

        if (win == 1)
                return NULL;

        /* Copy the PTEs because the page table may be unmapped */
        if (fpfn == pfn + 1)
                swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
        else if (pfn == fpfn + 1)
                swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
                                  &start, &end);
        else {
                left = (win - 1) / 2;
                swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
                                  &start, &end);
        }
        swap_ra->nr_pte = end - start;
        swap_ra->offset = fpfn - start;
        pte = vmf->pte - swap_ra->offset;
#ifdef CONFIG_64BIT
        swap_ra->ptes = pte;
#else
        tpte = swap_ra->ptes;
        for (pfn = start; pfn != end; pfn++)
                *tpte++ = *pte++;
#endif

        return NULL;
}

struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
                                    struct vm_fault *vmf,
                                    struct vma_swap_readahead *swap_ra)
{
        struct blk_plug plug;
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        pte_t *pte, pentry;
        swp_entry_t entry;
        unsigned int i;
        bool page_allocated;

        if (swap_ra->win == 1)
                goto skip;

        blk_start_plug(&plug);
        for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
             i++, pte++) {
                pentry = *pte;
                if (pte_none(pentry))
                        continue;
                if (pte_present(pentry))
                        continue;
                entry = pte_to_swp_entry(pentry);
                if (unlikely(non_swap_entry(entry)))
                        continue;
                page = __read_swap_cache_async(entry, gfp_mask, vma,
                                               vmf->address, &page_allocated);
                if (!page)
                        continue;
                if (page_allocated) {
                        swap_readpage(page, false);
                        if (i != swap_ra->offset &&
                            likely(!PageTransCompound(page))) {
                                SetPageReadahead(page);
                                count_vm_event(SWAP_RA);
                        }
                }
                put_page(page);
        }
        blk_finish_plug(&plug);
        lru_add_drain();
skip:
        return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
                                     swap_ra->win == 1);
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    const char *buf, size_t count)
{
        if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
                swap_vma_readahead = true;
        else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
                swap_vma_readahead = false;
        else
                return -EINVAL;

        return count;
}
static struct kobj_attribute vma_ra_enabled_attr =
        __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
               vma_ra_enabled_store);

static struct attribute *swap_attrs[] = {
        &vma_ra_enabled_attr.attr,
        NULL,
};

static struct attribute_group swap_attr_group = {
        .attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
        int err;
        struct kobject *swap_kobj;

        swap_kobj = kobject_create_and_add("swap", mm_kobj);
        if (!swap_kobj) {
                pr_err("failed to create swap kobject\n");
                return -ENOMEM;
        }
        err = sysfs_create_group(swap_kobj, &swap_attr_group);
        if (err) {
                pr_err("failed to register swap group\n");
                goto delete_obj;
        }
        return 0;

delete_obj:
        kobject_put(swap_kobj);
        return err;
}
subsys_initcall(swap_init_sysfs);
#endif