// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/swap_state.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 *
 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_ORDER_CEILING	5

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
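
/*
 * For illustration, assuming 4K pages (PAGE_SHIFT == 12, so
 * SWAP_RA_WIN_SHIFT == 6), the per-VMA readahead state packs a
 * page-aligned address, a window size and a hit count into one word:
 *
 *	SWAP_RA_VAL(0x7f0000001000, 8, 3)
 *		== 0x7f0000001000 | (8 << 6) | 3 == 0x7f0000001203
 *
 * SWAP_RA_ADDR(), SWAP_RA_WIN() and SWAP_RA_HITS() then recover
 * 0x7f0000001000, 8 and 3 respectively.
 */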

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swap_cache_index(entry);
	void *shadow;

	shadow = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(shadow))
		return shadow;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and 'swap' instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swap_cache_index(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
	unsigned long i, nr = folio_nr_pages(folio);
	void *old;

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);
	folio->swap = entry;

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			if (shadowp) {
				old = xas_load(&xas);
				if (xa_is_value(old))
					*shadowp = old;
			}
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	folio_clear_swapcache(folio);
	folio_ref_sub(folio, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i;
	long nr = folio_nr_pages(folio);
	pgoff_t idx = swap_cache_index(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != folio, entry);
		xas_next(&xas);
	}
	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	address_space->nrpages -= nr;
	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
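
/*
 * Note on shadow entries: when reclaim evicts a folio,
 * __delete_from_swap_cache() leaves the caller-supplied workingset
 * shadow value in each of the folio's swap cache slots.  A later
 * swapin sees that value again: add_to_swap_cache() reports it back
 * through @shadowp, and __read_swap_cache_async() passes it to
 * workingset_refault() so the refault distance can be measured.
 */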

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is MADV_FREE pages: their
	 * ptes may have the dirty bit cleared while the folio's
	 * SwapBacked flag is still set, because clearing the dirty bit
	 * and the SwapBacked flag is not protected by any lock. For such
	 * a folio, unmap will not set the dirty bit, so folio reclaim
	 * will not write the folio out. This can cause data corruption
	 * when the folio is swapped in later. Always setting the dirty
	 * flag for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list,
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
	swp_entry_t entry = folio->swap;
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_folio(folio, entry);
	folio_ref_sub(folio, folio_nr_pages(folio));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, index);

		xas_set_update(&xas, workingset_update_node);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
		if (curr > end)
			break;
	}
}
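
/*
 * For illustration: each swap cache address space covers
 * SWAP_ADDRESS_SPACE_PAGES slots (16384 slots, i.e. 64MB of swap with
 * 4K pages, assuming SWAP_ADDRESS_SPACE_SHIFT is 14), so clearing the
 * shadows for a freed range walks one xarray per such chunk, taking
 * and dropping each tree's lock in turn.
 */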

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *					- Marcelo
 */
void free_swap_cache(struct folio *folio)
{
	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	struct folio *folio = page_folio(page);

	free_swap_cache(folio);
	if (!is_huge_zero_folio(folio))
		folio_put(folio);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	struct folio_batch folios;
	unsigned int refs[PAGEVEC_SIZE];

	folio_batch_init(&folios);
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		free_swap_cache(folio);
		refs[folios.nr] = 1;
		if (unlikely(encoded_page_flags(pages[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[folios.nr] = encoded_nr_pages(pages[++i]);

		if (folio_batch_add(&folios, folio) == 0)
			folios_put_refs(&folios, refs);
	}
	if (folios.nr)
		folios_put_refs(&folios, refs);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *folio;

	folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
	if (!IS_ERR(folio)) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(folio_test_large(folio)))
			return folio;

		readahead = folio_test_clear_readahead(folio);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	} else {
		folio = NULL;
	}

	return folio;
}
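
/*
 * A minimal usage sketch, assuming a do_swap_page()-style caller: look
 * the entry up in the swap cache first and fall back to readahead-driven
 * I/O only when it is missing.
 *
 *	folio = swap_cache_get_folio(entry, vma, vmf->address);
 *	if (!folio)
 *		folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
 *
 * The folio comes back unlocked with an elevated refcount; the caller
 * still needs to lock it and re-check folio->swap before mapping it.
 */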

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio, or an ERR_PTR() if it is not present.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
		pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct folio *folio = filemap_get_entry(mapping, index);

	if (!folio)
		return ERR_PTR(-ENOENT);
	if (!xa_is_value(folio))
		return folio;
	if (!shmem_mapping(mapping))
		return ERR_PTR(-ENOENT);

	swp = radix_to_swp_entry(folio);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return ERR_PTR(-ENOENT);
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return ERR_PTR(-ENOENT);
	index = swap_cache_index(swp);
	folio = filemap_get_folio(swap_address_space(swp), index);
	put_swap_device(si);
	return folio;
}

struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
		bool skip_if_exists)
{
	struct swap_info_struct *si;
	struct folio *folio;
	struct folio *new_folio = NULL;
	struct folio *result = NULL;
	void *shadow = NULL;

	*new_page_allocated = false;
	si = get_swap_device(entry);
	if (!si)
		return NULL;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after swap_cache_get_folio() failed, re-calling
		 * that would confuse statistics.
		 */
		folio = filemap_get_folio(swap_address_space(entry),
					  swap_cache_index(entry));
		if (!IS_ERR(folio))
			goto got_folio;

		/*
		 * Just skip readahead for an unused swap slot.
		 * During swapoff, when swap_slot_cache is disabled,
		 * we have to handle the race between putting a swap
		 * entry into the swap cache and marking the swap slot
		 * as SWAP_HAS_CACHE.  That is handled further down,
		 * or else swapoff will be aborted if we return NULL.
		 */
		if (!swap_entry_swapped(si, entry) && swap_slot_cache_enabled)
			goto put_and_return;

		/*
		 * Get a new folio to read into from swap.  Allocate it now
		 * if new_folio does not exist yet, before marking swap_map
		 * SWAP_HAS_CACHE, when -EEXIST will cause any racers to
		 * loop around until we add it to the cache.
		 */
		if (!new_folio) {
			new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
			if (!new_folio)
				goto put_and_return;
		}

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry, 1);
		if (!err)
			break;
		else if (err != -EEXIST)
			goto put_and_return;

		/*
		 * Protect against a recursive call to __read_swap_cache_async()
		 * on the same entry waiting forever here because SWAP_HAS_CACHE
		 * is set but the folio is not in the swap cache yet.  This can
		 * happen today if mem_cgroup_swapin_charge_folio() below
		 * triggers reclaim through zswap, which may call
		 * __read_swap_cache_async() in the writeback path.
		 */
		if (skip_if_exists)
			goto put_and_return;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its folio to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new folio.
	 */
	__folio_set_locked(new_folio);
	__folio_set_swapbacked(new_folio);

	if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	memcg1_swapin(entry, 1);

	if (shadow)
		workingset_refault(new_folio, shadow);

	/* Caller will initiate read into locked new_folio */
	folio_add_lru(new_folio);
	*new_page_allocated = true;
	folio = new_folio;
got_folio:
	result = folio;
	goto put_and_return;

fail_unlock:
	put_swap_folio(new_folio, entry);
	folio_unlock(new_folio);
put_and_return:
	put_swap_device(si);
	if (!(*new_page_allocated) && new_folio)
		folio_put(new_folio);
	return result;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * get/put_swap_device() aren't needed to call this function, because
 * __read_swap_cache_async() calls them and swap_read_folio() holds the
 * swap cache folio lock.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	mpol_cond_put(mpol);

	if (page_allocated)
		swap_read_folio(folio, plug);
	return folio;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
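
/*
 * Worked example of the window heuristic above, assuming
 * page_cluster == 3 (max_pages == 8) and a previous window of 8:
 * with hits == 5, pages = 5 + 2 = 7, rounded up to 8 and capped at
 * max_pages, so the next window stays at 8.  With no hits and a
 * non-adjacent offset, pages drops to 1 but is still held at
 * prev_win / 2 == 4, so the window shrinks gradually rather than
 * collapsing at once.
 */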

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset; offset++) {
		/* Ok, do the async read-ahead now */
		folio = __read_swap_cache_async(
				swp_entry(swp_type(entry), offset),
				gfp_mask, mpol, ilx, &page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}
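
/*
 * For illustration, assuming swapin_nr_pages() returns the full 8-page
 * window (page_cluster == 3): mask == 7, so a fault on swap offset
 * 0x123 reads the aligned block of offsets [0x120, 0x127].  Offset 0
 * is never read because the first page of a swap area holds the swap
 * header.
 */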

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
			   unsigned long *end)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, prev_faddr, left, right;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
	if (max_win == 1)
		return 1;

	faddr = vmf->address;
	ra_val = GET_SWAP_RA_VAL(vma);
	prev_faddr = SWAP_RA_ADDR(ra_val);
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
				max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return 1;

	if (faddr == prev_faddr + PAGE_SIZE)
		left = faddr;
	else if (prev_faddr == faddr + PAGE_SIZE)
		left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
	else
		left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
	right = left + (win << PAGE_SHIFT);
	if ((long)left < 0)
		left = 0;
	*start = max3(left, vma->vm_start, faddr & PMD_MASK);
	*end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);

	return win;
}
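
/*
 * For illustration: with a window of 4 pages and a fault that does not
 * extend a previous sequential run, swap_vma_ra_win() centres the
 * window just below the fault (left = faddr minus one page), covering
 * the faulting page, one page below it and two pages above it, then
 * clamps the result to the VMA and to the PMD containing faddr so the
 * page table walk below never crosses a PMD boundary.
 */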

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	int win;
	unsigned long start, end, addr;
	swp_entry_t entry;
	pgoff_t ilx;
	bool page_allocated;

	win = swap_vma_ra_win(vmf, &start, &end);
	if (win == 1)
		goto skip;

	ilx = targ_ilx - PFN_DOWN(vmf->address - start);

	blk_start_plug(&plug);
	for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		pte_unmap(pte);
		pte = NULL;
		folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
						&page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (addr != vmf->address) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
					&page_allocated, false);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * This is the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead either by cluster (i.e. physical,
 * disk-based offsets) or by vma (i.e. virtual addresses around the
 * faulting address).
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			       struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}
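
/*
 * The readahead policy can be switched at run time through sysfs,
 * e.g. "echo false > /sys/kernel/mm/swap/vma_ra_enabled" makes every
 * swapin fault fall back to cluster readahead (see the vma_ra_enabled
 * attribute below).
 */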

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif