// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/folio_batch.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap_table.h"
#include "swap.h"

/*
 * swap_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
static const struct address_space_operations swap_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space swap_space __read_mostly = {
	.a_ops = &swap_aops,
};

static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_ORDER_CEILING	5

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

/**
 * swap_cache_get_folio - Looks up a folio in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * A found folio will be returned unlocked and with its refcount increased.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns the found folio on success, NULL otherwise. The caller
 * must lock the folio and check that it still matches the swap entry before
 * use (e.g. with folio_matches_swap_entry()).
 */
struct folio *swap_cache_get_folio(swp_entry_t entry)
{
	unsigned long swp_tb;
	struct folio *folio;

	for (;;) {
		swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
					swp_cluster_offset(entry));
		if (!swp_tb_is_folio(swp_tb))
			return NULL;
		folio = swp_tb_to_folio(swp_tb);
		if (likely(folio_try_get(folio)))
			return folio;
	}

	return NULL;
}
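
/*
 * Example usage (illustrative sketch, not a caller in this file): a
 * lookup is normally followed by a re-check under the folio lock, as
 * required above. If the folio no longer matches the entry, the caller
 * raced with removal or reuse and must unlock, put and retry:
 *
 *	folio = swap_cache_get_folio(entry);
 *	if (folio) {
 *		folio_lock(folio);
 *		if (!folio_matches_swap_entry(folio, entry)) {
 *			folio_unlock(folio);
 *			folio_put(folio);
 *			folio = NULL;
 *		}
 *	}
 */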

/**
 * swap_cache_has_folio - Check if a swap slot has cache.
 * @entry: swap entry indicating the slot.
 *
 * Context: Caller must ensure @entry is valid and protect the swap
 * device with reference count or locks.
 * Return: true if the slot is cached by a folio, false otherwise.
 */
bool swap_cache_has_folio(swp_entry_t entry)
{
	unsigned long swp_tb;

	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
				swp_cluster_offset(entry));
	return swp_tb_is_folio(swp_tb);
}

/**
 * swap_cache_get_shadow - Looks up a shadow in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns either NULL or an XA_VALUE (shadow).
 */
void *swap_cache_get_shadow(swp_entry_t entry)
{
	unsigned long swp_tb;

	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
				swp_cluster_offset(entry));
	if (swp_tb_is_shadow(swp_tb))
		return swp_tb_to_shadow(swp_tb);
	return NULL;
}

void __swap_cache_add_folio(struct swap_cluster_info *ci,
			    struct folio *folio, swp_entry_t entry)
{
	unsigned int ci_off = swp_cluster_offset(entry), ci_end;
	unsigned long nr_pages = folio_nr_pages(folio);
	unsigned long pfn = folio_pfn(folio);
	unsigned long old_tb;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);

	ci_end = ci_off + nr_pages;
	do {
		old_tb = __swap_table_get(ci, ci_off);
		VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb));
		__swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
	} while (++ci_off < ci_end);

	folio_ref_add(folio, nr_pages);
	folio_set_swapcache(folio);
	folio->swap = entry;

	node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
}

/**
 * swap_cache_add_folio - Add a folio into the swap cache.
 * @folio: The folio to be added.
 * @entry: The swap entry corresponding to the folio.
 * @shadowp: If a shadow is found, return the shadow.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: 0 on success, -EEXIST if a folio is already cached for @entry,
 * -ENOENT if the slot is no longer valid (e.g. freed or raced with swapoff).
 */
static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
				void **shadowp)
{
	int err;
	void *shadow = NULL;
	unsigned long old_tb;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned int ci_start, ci_off, ci_end;
	unsigned long nr_pages = folio_nr_pages(folio);

	si = __swap_entry_to_info(entry);
	ci_start = swp_cluster_offset(entry);
	ci_end = ci_start + nr_pages;
	ci_off = ci_start;
	ci = swap_cluster_lock(si, swp_offset(entry));
	if (unlikely(!ci->table)) {
		err = -ENOENT;
		goto failed;
	}
	do {
		old_tb = __swap_table_get(ci, ci_off);
		if (unlikely(swp_tb_is_folio(old_tb))) {
			err = -EEXIST;
			goto failed;
		}
		if (unlikely(!__swp_tb_get_count(old_tb))) {
			err = -ENOENT;
			goto failed;
		}
		if (swp_tb_is_shadow(old_tb))
			shadow = swp_tb_to_shadow(old_tb);
	} while (++ci_off < ci_end);
	__swap_cache_add_folio(ci, folio, entry);
	swap_cluster_unlock(ci);
	if (shadowp)
		*shadowp = shadow;
	return 0;

failed:
	swap_cluster_unlock(ci);
	return err;
}
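
/*
 * A rough summary of the per-slot states the add path above distinguishes
 * (see swap_table.h for the exact encoding):
 *
 *	swap count only		swapped out, not cached, no shadow kept
 *	shadow + swap count	swapped out, workingset shadow kept
 *	folio			cached by the swap cache, count kept alongside
 *
 * swap_cache_add_folio() turns the first two states into the third while
 * preserving the count, returning any shadow through @shadowp;
 * __swap_cache_del_folio() below does the reverse.
 */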

/**
 * __swap_cache_del_folio - Removes a folio from the swap cache.
 * @ci: The locked swap cluster.
 * @folio: The folio.
 * @entry: The first swap entry that the folio corresponds to.
 * @shadow: shadow value to be filled in the swap cache.
 *
 * Removes a folio from the swap cache and fills a shadow in place.
 * This does not drop the folio's refcount; the caller has to do that.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache
 * using the index of @entry, and lock the cluster that holds the entries.
 */
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
			    swp_entry_t entry, void *shadow)
{
	int count;
	unsigned long old_tb;
	struct swap_info_struct *si;
	unsigned int ci_start, ci_off, ci_end;
	bool folio_swapped = false, need_free = false;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);

	si = __swap_entry_to_info(entry);
	ci_start = swp_cluster_offset(entry);
	ci_end = ci_start + nr_pages;
	ci_off = ci_start;
	do {
		old_tb = __swap_table_get(ci, ci_off);
		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
			     swp_tb_to_folio(old_tb) != folio);
		count = __swp_tb_get_count(old_tb);
		if (count)
			folio_swapped = true;
		else
			need_free = true;
		/* If shadow is NULL, we set an empty shadow. */
		__swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count));
	} while (++ci_off < ci_end);

	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);

	if (!folio_swapped) {
		__swap_cluster_free_entries(si, ci, ci_start, nr_pages);
	} else if (need_free) {
		ci_off = ci_start;
		do {
			if (!__swp_tb_get_count(__swap_table_get(ci, ci_off)))
				__swap_cluster_free_entries(si, ci, ci_off, 1);
		} while (++ci_off < ci_end);
	}
}

/**
 * swap_cache_del_folio - Removes a folio from the swap cache.
 * @folio: The folio.
 *
 * Same as __swap_cache_del_folio, but handles the cluster lock and the
 * folio's refcount. The caller must ensure the folio is either clean or
 * has a swap count equal to zero, or it may cause data loss.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 */
void swap_cache_del_folio(struct folio *folio)
{
	struct swap_cluster_info *ci;
	swp_entry_t entry = folio->swap;

	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
	__swap_cache_del_folio(ci, folio, entry, NULL);
	swap_cluster_unlock(ci);

	folio_ref_sub(folio, folio_nr_pages(folio));
}

/**
 * __swap_cache_replace_folio - Replace a folio in the swap cache.
 * @ci: The locked swap cluster.
 * @old: The old folio to be replaced.
 * @new: The new folio.
 *
 * Replace an existing folio in the swap cache with a new folio. The
 * caller is responsible for setting up the new folio's flag and swap
 * entries. Replacement will take the new folio's swap entry value as
 * the starting offset to override all slots covered by the new folio.
 *
 * Context: Caller must ensure both folios are locked, and lock the
 * cluster that holds the old folio to be replaced.
 */
void __swap_cache_replace_folio(struct swap_cluster_info *ci,
				struct folio *old, struct folio *new)
{
	swp_entry_t entry = new->swap;
	unsigned long nr_pages = folio_nr_pages(new);
	unsigned int ci_off = swp_cluster_offset(entry);
	unsigned int ci_end = ci_off + nr_pages;
	unsigned long pfn = folio_pfn(new);
	unsigned long old_tb;

	VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
	VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
	VM_WARN_ON_ONCE(!entry.val);

	/* Swap cache still stores N entries instead of a high-order entry */
	do {
		old_tb = __swap_table_get(ci, ci_off);
		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
		__swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb)));
	} while (++ci_off < ci_end);

	/*
	 * If the old folio is partially replaced (e.g., splitting a large
	 * folio, the old folio is shrunk, and new split sub folios replace
	 * the shrunk part), ensure the new folio doesn't overlap it.
	 */
	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
	    folio_order(old) != folio_order(new)) {
		ci_off = swp_cluster_offset(old->swap);
		ci_end = ci_off + folio_nr_pages(old);
		while (ci_off < ci_end)
			WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off++)) != old);
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *					- Marcelo
 */
void free_swap_cache(struct folio *folio)
{
	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Free a folio and also free any swap cache associated with
 * this folio if we are the last user.
 */
void free_folio_and_swap_cache(struct folio *folio)
{
	free_swap_cache(folio);
	if (!is_huge_zero_folio(folio))
		folio_put(folio);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them. They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	struct folio_batch folios;
	unsigned int refs[FOLIO_BATCH_SIZE];

	folio_batch_init(&folios);
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		free_swap_cache(folio);
		refs[folios.nr] = 1;
		if (unlikely(encoded_page_flags(pages[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[folios.nr] = encoded_nr_pages(pages[++i]);

		if (folio_batch_add(&folios, folio) == 0)
			folios_put_refs(&folios, refs);
	}
	if (folios.nr)
		folios_put_refs(&folios, refs);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}
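
/*
 * Example of the readahead state packing defined by the SWAP_RA_* macros
 * above (illustrative, assuming PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT == 6):
 *
 *	val = SWAP_RA_VAL(addr, 8, 3);
 *	SWAP_RA_ADDR(val) == addr & PAGE_MASK	(bits 12 and up)
 *	SWAP_RA_WIN(val)  == 8			(bits 6-11)
 *	SWAP_RA_HITS(val) == 3			(bits 0-5)
 *
 * i.e. the last faulting address, the last readahead window and the hit
 * count all fit in the one atomic_long_t vma->swap_readahead_info.
 */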

/**
 * swap_update_readahead - Update the readahead statistics of VMA or globally.
 * @folio: the swap cache folio that just got hit.
 * @vma: the VMA that should be updated, may be NULL for a global update.
 * @addr: the address that triggered the swapin, ignored if @vma is NULL.
 */
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
			   unsigned long addr)
{
	bool readahead, vma_ra = swap_use_vma_readahead();

	/*
	 * At the moment, we don't support PG_readahead for anon THP
	 * so let's bail out rather than confusing the readahead stat.
	 */
	if (unlikely(folio_test_large(folio)))
		return;

	readahead = folio_test_clear_readahead(folio);
	if (vma && vma_ra) {
		unsigned long ra_val;
		int win, hits;

		ra_val = GET_SWAP_RA_VAL(vma);
		win = SWAP_RA_WIN(ra_val);
		hits = SWAP_RA_HITS(ra_val);
		if (readahead)
			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
		atomic_long_set(&vma->swap_readahead_info,
				SWAP_RA_VAL(addr, win, hits));
	}

	if (readahead) {
		count_vm_event(SWAP_RA_HIT);
		if (!vma || !vma_ra)
			atomic_inc(&swapin_readahead_hits);
	}
}

/**
 * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
 * @entry: swap entry to be bound to the folio.
 * @folio: folio to be added.
 * @gfp: memory allocation flags for charge, can be 0 if @charged is true.
 * @charged: if the folio is already charged.
 *
 * Charge the folio if needed and add it to the swap cache, typically
 * before swapin. All swap slots covered by the folio must have a
 * non-zero swap count.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the folio being added on success. Returns the existing folio
 * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
 */
static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
						  struct folio *folio,
						  gfp_t gfp, bool charged)
{
	struct folio *swapcache = NULL;
	void *shadow;
	int ret;

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
		goto failed;

	for (;;) {
		ret = swap_cache_add_folio(folio, entry, &shadow);
		if (!ret)
			break;

		/*
		 * Large order allocation needs special handling on
		 * race: if a smaller folio exists in cache, swapin needs
		 * to fallback to order 0, and doing a swap cache lookup
		 * might return a folio that is irrelevant to the faulting
		 * entry because @entry is aligned down. Just return NULL.
		 */
		if (ret != -EEXIST || folio_test_large(folio))
			goto failed;

		swapcache = swap_cache_get_folio(entry);
		if (swapcache)
			goto failed;
	}

	memcg1_swapin(entry, folio_nr_pages(folio));
	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	return folio;

failed:
	folio_unlock(folio);
	return swapcache;
}

/**
 * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
 * @entry: the swapped out swap entry to be bound to the folio.
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @new_page_allocated: set to true if a new folio was allocated, false otherwise
 *
 * Allocate a folio in the swap cache for one swap slot, typically before
 * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
 * @entry must have a non-zero swap count (swapped out).
 * Currently only supports order 0.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the newly allocated and cached folio (with
 * *@new_page_allocated set to true) on success. Returns the existing folio
 * if @entry is cached already. Returns NULL if the allocation failed or
 * @entry no longer has a swap count (is not swapped out).
 */
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx,
				     bool *new_page_allocated)
{
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct folio *folio;
	struct folio *result = NULL;

	*new_page_allocated = false;
	/* Check the swap cache again for readahead path. */
	folio = swap_cache_get_folio(entry);
	if (folio)
		return folio;

	/* Skip allocation for unused and bad swap slot for readahead. */
	if (!swap_entry_swapped(si, entry))
		return NULL;

	/* Allocate a new folio to be added into the swap cache. */
	folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
	if (!folio)
		return NULL;
	/* Try add the new folio, returns existing folio or NULL on failure. */
	result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
	if (result == folio)
		*new_page_allocated = true;
	else
		folio_put(folio);
	return result;
}

/**
 * swapin_folio - swap-in one or multiple entries skipping readahead.
 * @entry: starting swap entry to swap in
 * @folio: a newly allocated and charged folio
 *
 * Reads @entry into @folio. @folio will be added to the swap cache.
 * If @folio is a large folio, @entry will be rounded down to align
 * with the folio size.
 *
 * Return: Returns a pointer to @folio on success. If @folio is a large
 * folio and this raced with another swapin, NULL will be returned to
 * allow a fallback to order 0. Otherwise, if another folio was already
 * added to the swap cache, that swap cache folio is returned instead.
 */
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
	struct folio *swapcache;
	pgoff_t offset = swp_offset(entry);
	unsigned long nr_pages = folio_nr_pages(folio);

	entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
	if (swapcache == folio)
		swap_read_folio(folio, NULL);
	return swapcache;
}
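
/*
 * Example (illustrative): calling swapin_folio() with an order-2 folio
 * and an entry at offset 0x103 rounds the entry down to offset 0x100,
 * so the folio covers and is read from slots 0x100-0x103.
 */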

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	struct swap_info_struct *si;
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	si = get_swap_device(entry);
	if (!si)
		return NULL;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	mpol_cond_put(mpol);

	if (page_allocated)
		swap_read_folio(folio, plug);

	put_swap_device(si);
	return folio;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}
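
/*
 * Worked example of the heuristic above (illustrative): with
 * page_cluster == 3, max_pages is 8. If the previous interval saw
 * hits == 3, then pages = 3 + 2 = 5, rounded up to the next power of
 * two that is at least 4, i.e. 8, and finally clamped to max_pages.
 * With no hits and a non-adjacent offset the window collapses to 1,
 * but never below half of the previous window (prev_win / 2).
 */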

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for @entry, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time. We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		folio = swap_cache_alloc_folio(
				swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
				&page_allocated);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
			   unsigned long *end)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, prev_faddr, left, right;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
	if (max_win == 1)
		return 1;

	faddr = vmf->address;
	ra_val = GET_SWAP_RA_VAL(vma);
	prev_faddr = SWAP_RA_ADDR(ra_val);
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
				max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return 1;

	if (faddr == prev_faddr + PAGE_SIZE)
		left = faddr;
	else if (prev_faddr == faddr + PAGE_SIZE)
		left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
	else
		left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
	right = left + (win << PAGE_SHIFT);
	if ((long)left < 0)
		left = 0;
	*start = max3(left, vma->vm_start, faddr & PMD_MASK);
	*end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);

	return win;
}
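
/*
 * Example of the window placement above (illustrative, win == 4):
 *
 *	forward scan  (faddr == prev_faddr + PAGE_SIZE):
 *		readahead window is [faddr, faddr + 4 pages)
 *	backward scan (prev_faddr == faddr + PAGE_SIZE):
 *		readahead window is [faddr - 3 pages, faddr + 1 page)
 *	random access:
 *		window is roughly centered: [faddr - 1 page, faddr + 3 pages)
 *
 * The window is then clipped to the VMA and to the PMD containing the
 * faulting address.
 */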

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	int win;
	unsigned long start, end, addr;
	pgoff_t ilx;
	bool page_allocated;

	win = swap_vma_ra_win(vmf, &start, &end);
	if (win == 1)
		goto skip;

	ilx = targ_ilx - PFN_DOWN(vmf->address - start);

	blk_start_plug(&plug);
	for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
		struct swap_info_struct *si = NULL;
		softleaf_t entry;

		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		entry = softleaf_from_pte(pentry);

		if (!softleaf_is_swap(entry))
			continue;
		pte_unmap(pte);
		pte = NULL;
		/*
		 * Readahead entry may come from a device that we are not
		 * holding a reference to, try to grab a reference, or skip.
		 */
		if (swp_type(entry) != swp_type(targ_entry)) {
			si = get_swap_device(entry);
			if (!si)
				continue;
		}
		folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
					       &page_allocated);
		if (si)
			put_swap_device(si);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (addr != vmf->address) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * It's the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based
 * (i.e. physical disk based) or VMA-based (i.e. virtual address based
 * on the faulting address) readahead.
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			       struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	/* Swap cache writeback is LRU based, no tags for it */
	mapping_set_no_writeback_tags(&swap_space);
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init);
#endif