1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/swap_state.c 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 * 8 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 9 */ 10 #include <linux/mm.h> 11 #include <linux/gfp.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/mempolicy.h> 14 #include <linux/swap.h> 15 #include <linux/leafops.h> 16 #include <linux/init.h> 17 #include <linux/pagemap.h> 18 #include <linux/folio_batch.h> 19 #include <linux/backing-dev.h> 20 #include <linux/blkdev.h> 21 #include <linux/migrate.h> 22 #include <linux/vmalloc.h> 23 #include <linux/huge_mm.h> 24 #include <linux/shmem_fs.h> 25 #include "internal.h" 26 #include "swap_table.h" 27 #include "swap.h" 28 29 /* 30 * swapper_space is a fiction, retained to simplify the path through 31 * vmscan's shrink_folio_list. 32 */ 33 static const struct address_space_operations swap_aops = { 34 .dirty_folio = noop_dirty_folio, 35 #ifdef CONFIG_MIGRATION 36 .migrate_folio = migrate_folio, 37 #endif 38 }; 39 40 struct address_space swap_space __read_mostly = { 41 .a_ops = &swap_aops, 42 }; 43 44 static bool enable_vma_readahead __read_mostly = true; 45 46 #define SWAP_RA_ORDER_CEILING 5 47 48 #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 49 #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 50 #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK 51 #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) 52 53 #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) 54 #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) 55 #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) 56 57 #define SWAP_RA_VAL(addr, win, hits) \ 58 (((addr) & PAGE_MASK) | \ 59 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ 60 ((hits) & SWAP_RA_HITS_MASK)) 61 62 /* Initial readahead hits is 4 to start up with a small window */ 63 #define GET_SWAP_RA_VAL(vma) \ 64 (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) 65 66 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); 67 68 void show_swap_cache_info(void) 69 { 70 printk("%lu pages in swap cache\n", total_swapcache_pages()); 71 printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); 72 printk("Total swap = %lukB\n", K(total_swap_pages)); 73 } 74 75 /** 76 * swap_cache_get_folio - Looks up a folio in the swap cache. 77 * @entry: swap entry used for the lookup. 78 * 79 * A found folio will be returned unlocked and with its refcount increased. 80 * 81 * Context: Caller must ensure @entry is valid and protect the swap device 82 * with reference count or locks. 83 * Return: Returns the found folio on success, NULL otherwise. The caller 84 * must lock and check if the folio still matches the swap entry before 85 * use (e.g., folio_matches_swap_entry). 86 */ 87 struct folio *swap_cache_get_folio(swp_entry_t entry) 88 { 89 unsigned long swp_tb; 90 struct folio *folio; 91 92 for (;;) { 93 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 94 swp_cluster_offset(entry)); 95 if (!swp_tb_is_folio(swp_tb)) 96 return NULL; 97 folio = swp_tb_to_folio(swp_tb); 98 if (likely(folio_try_get(folio))) 99 return folio; 100 } 101 102 return NULL; 103 } 104 105 /** 106 * swap_cache_has_folio - Check if a swap slot has cache. 107 * @entry: swap entry indicating the slot. 108 * 109 * Context: Caller must ensure @entry is valid and protect the swap 110 * device with reference count or locks. 111 */ 112 bool swap_cache_has_folio(swp_entry_t entry) 113 { 114 unsigned long swp_tb; 115 116 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 117 swp_cluster_offset(entry)); 118 return swp_tb_is_folio(swp_tb); 119 } 120 121 /** 122 * swap_cache_get_shadow - Looks up a shadow in the swap cache. 123 * @entry: swap entry used for the lookup. 124 * 125 * Context: Caller must ensure @entry is valid and protect the swap device 126 * with reference count or locks. 127 * Return: Returns either NULL or an XA_VALUE (shadow). 128 */ 129 void *swap_cache_get_shadow(swp_entry_t entry) 130 { 131 unsigned long swp_tb; 132 133 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 134 swp_cluster_offset(entry)); 135 if (swp_tb_is_shadow(swp_tb)) 136 return swp_tb_to_shadow(swp_tb); 137 return NULL; 138 } 139 140 void __swap_cache_add_folio(struct swap_cluster_info *ci, 141 struct folio *folio, swp_entry_t entry) 142 { 143 unsigned int ci_off = swp_cluster_offset(entry), ci_end; 144 unsigned long nr_pages = folio_nr_pages(folio); 145 unsigned long pfn = folio_pfn(folio); 146 unsigned long old_tb; 147 148 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 149 VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); 150 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); 151 152 ci_end = ci_off + nr_pages; 153 do { 154 old_tb = __swap_table_get(ci, ci_off); 155 VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); 156 __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); 157 } while (++ci_off < ci_end); 158 159 folio_ref_add(folio, nr_pages); 160 folio_set_swapcache(folio); 161 folio->swap = entry; 162 163 node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); 164 lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); 165 } 166 167 /** 168 * swap_cache_add_folio - Add a folio into the swap cache. 169 * @folio: The folio to be added. 170 * @entry: The swap entry corresponding to the folio. 171 * @gfp: gfp_mask for XArray node allocation. 172 * @shadowp: If a shadow is found, return the shadow. 173 * 174 * Context: Caller must ensure @entry is valid and protect the swap device 175 * with reference count or locks. 176 */ 177 static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, 178 void **shadowp) 179 { 180 int err; 181 void *shadow = NULL; 182 unsigned long old_tb; 183 struct swap_info_struct *si; 184 struct swap_cluster_info *ci; 185 unsigned int ci_start, ci_off, ci_end; 186 unsigned long nr_pages = folio_nr_pages(folio); 187 188 si = __swap_entry_to_info(entry); 189 ci_start = swp_cluster_offset(entry); 190 ci_end = ci_start + nr_pages; 191 ci_off = ci_start; 192 ci = swap_cluster_lock(si, swp_offset(entry)); 193 if (unlikely(!ci->table)) { 194 err = -ENOENT; 195 goto failed; 196 } 197 do { 198 old_tb = __swap_table_get(ci, ci_off); 199 if (unlikely(swp_tb_is_folio(old_tb))) { 200 err = -EEXIST; 201 goto failed; 202 } 203 if (unlikely(!__swp_tb_get_count(old_tb))) { 204 err = -ENOENT; 205 goto failed; 206 } 207 if (swp_tb_is_shadow(old_tb)) 208 shadow = swp_tb_to_shadow(old_tb); 209 } while (++ci_off < ci_end); 210 __swap_cache_add_folio(ci, folio, entry); 211 swap_cluster_unlock(ci); 212 if (shadowp) 213 *shadowp = shadow; 214 return 0; 215 216 failed: 217 swap_cluster_unlock(ci); 218 return err; 219 } 220 221 /** 222 * __swap_cache_del_folio - Removes a folio from the swap cache. 223 * @ci: The locked swap cluster. 224 * @folio: The folio. 225 * @entry: The first swap entry that the folio corresponds to. 226 * @shadow: shadow value to be filled in the swap cache. 227 * 228 * Removes a folio from the swap cache and fills a shadow in place. 229 * This won't put the folio's refcount. The caller has to do that. 230 * 231 * Context: Caller must ensure the folio is locked and in the swap cache 232 * using the index of @entry, and lock the cluster that holds the entries. 233 */ 234 void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, 235 swp_entry_t entry, void *shadow) 236 { 237 int count; 238 unsigned long old_tb; 239 struct swap_info_struct *si; 240 unsigned int ci_start, ci_off, ci_end; 241 bool folio_swapped = false, need_free = false; 242 unsigned long nr_pages = folio_nr_pages(folio); 243 244 VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); 245 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 246 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); 247 VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); 248 249 si = __swap_entry_to_info(entry); 250 ci_start = swp_cluster_offset(entry); 251 ci_end = ci_start + nr_pages; 252 ci_off = ci_start; 253 do { 254 old_tb = __swap_table_get(ci, ci_off); 255 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || 256 swp_tb_to_folio(old_tb) != folio); 257 count = __swp_tb_get_count(old_tb); 258 if (count) 259 folio_swapped = true; 260 else 261 need_free = true; 262 /* If shadow is NULL, we sets an empty shadow. */ 263 __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); 264 } while (++ci_off < ci_end); 265 266 folio->swap.val = 0; 267 folio_clear_swapcache(folio); 268 node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); 269 lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); 270 271 if (!folio_swapped) { 272 __swap_cluster_free_entries(si, ci, ci_start, nr_pages); 273 } else if (need_free) { 274 ci_off = ci_start; 275 do { 276 if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) 277 __swap_cluster_free_entries(si, ci, ci_off, 1); 278 } while (++ci_off < ci_end); 279 } 280 } 281 282 /** 283 * swap_cache_del_folio - Removes a folio from the swap cache. 284 * @folio: The folio. 285 * 286 * Same as __swap_cache_del_folio, but handles lock and refcount. The 287 * caller must ensure the folio is either clean or has a swap count 288 * equal to zero, or it may cause data loss. 289 * 290 * Context: Caller must ensure the folio is locked and in the swap cache. 291 */ 292 void swap_cache_del_folio(struct folio *folio) 293 { 294 struct swap_cluster_info *ci; 295 swp_entry_t entry = folio->swap; 296 297 ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); 298 __swap_cache_del_folio(ci, folio, entry, NULL); 299 swap_cluster_unlock(ci); 300 301 folio_ref_sub(folio, folio_nr_pages(folio)); 302 } 303 304 /** 305 * __swap_cache_replace_folio - Replace a folio in the swap cache. 306 * @ci: The locked swap cluster. 307 * @old: The old folio to be replaced. 308 * @new: The new folio. 309 * 310 * Replace an existing folio in the swap cache with a new folio. The 311 * caller is responsible for setting up the new folio's flag and swap 312 * entries. Replacement will take the new folio's swap entry value as 313 * the starting offset to override all slots covered by the new folio. 314 * 315 * Context: Caller must ensure both folios are locked, and lock the 316 * cluster that holds the old folio to be replaced. 317 */ 318 void __swap_cache_replace_folio(struct swap_cluster_info *ci, 319 struct folio *old, struct folio *new) 320 { 321 swp_entry_t entry = new->swap; 322 unsigned long nr_pages = folio_nr_pages(new); 323 unsigned int ci_off = swp_cluster_offset(entry); 324 unsigned int ci_end = ci_off + nr_pages; 325 unsigned long pfn = folio_pfn(new); 326 unsigned long old_tb; 327 328 VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); 329 VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); 330 VM_WARN_ON_ONCE(!entry.val); 331 332 /* Swap cache still stores N entries instead of a high-order entry */ 333 do { 334 old_tb = __swap_table_get(ci, ci_off); 335 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); 336 __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); 337 } while (++ci_off < ci_end); 338 339 /* 340 * If the old folio is partially replaced (e.g., splitting a large 341 * folio, the old folio is shrunk, and new split sub folios replace 342 * the shrunk part), ensure the new folio doesn't overlap it. 343 */ 344 if (IS_ENABLED(CONFIG_DEBUG_VM) && 345 folio_order(old) != folio_order(new)) { 346 ci_off = swp_cluster_offset(old->swap); 347 ci_end = ci_off + folio_nr_pages(old); 348 while (ci_off++ < ci_end) 349 WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); 350 } 351 } 352 353 /* 354 * If we are the only user, then try to free up the swap cache. 355 * 356 * Its ok to check the swapcache flag without the folio lock 357 * here because we are going to recheck again inside 358 * folio_free_swap() _with_ the lock. 359 * - Marcelo 360 */ 361 void free_swap_cache(struct folio *folio) 362 { 363 if (folio_test_swapcache(folio) && !folio_mapped(folio) && 364 folio_trylock(folio)) { 365 folio_free_swap(folio); 366 folio_unlock(folio); 367 } 368 } 369 370 /* 371 * Freeing a folio and also freeing any swap cache associated with 372 * this folio if it is the last user. 373 */ 374 void free_folio_and_swap_cache(struct folio *folio) 375 { 376 free_swap_cache(folio); 377 if (!is_huge_zero_folio(folio)) 378 folio_put(folio); 379 } 380 381 /* 382 * Passed an array of pages, drop them all from swapcache and then release 383 * them. They are removed from the LRU and freed if this is their last use. 384 */ 385 void free_pages_and_swap_cache(struct encoded_page **pages, int nr) 386 { 387 struct folio_batch folios; 388 unsigned int refs[FOLIO_BATCH_SIZE]; 389 390 folio_batch_init(&folios); 391 for (int i = 0; i < nr; i++) { 392 struct folio *folio = page_folio(encoded_page_ptr(pages[i])); 393 394 free_swap_cache(folio); 395 refs[folios.nr] = 1; 396 if (unlikely(encoded_page_flags(pages[i]) & 397 ENCODED_PAGE_BIT_NR_PAGES_NEXT)) 398 refs[folios.nr] = encoded_nr_pages(pages[++i]); 399 400 if (folio_batch_add(&folios, folio) == 0) 401 folios_put_refs(&folios, refs); 402 } 403 if (folios.nr) 404 folios_put_refs(&folios, refs); 405 } 406 407 static inline bool swap_use_vma_readahead(void) 408 { 409 return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); 410 } 411 412 /** 413 * swap_update_readahead - Update the readahead statistics of VMA or globally. 414 * @folio: the swap cache folio that just got hit. 415 * @vma: the VMA that should be updated, could be NULL for global update. 416 * @addr: the addr that triggered the swapin, ignored if @vma is NULL. 417 */ 418 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, 419 unsigned long addr) 420 { 421 bool readahead, vma_ra = swap_use_vma_readahead(); 422 423 /* 424 * At the moment, we don't support PG_readahead for anon THP 425 * so let's bail out rather than confusing the readahead stat. 426 */ 427 if (unlikely(folio_test_large(folio))) 428 return; 429 430 readahead = folio_test_clear_readahead(folio); 431 if (vma && vma_ra) { 432 unsigned long ra_val; 433 int win, hits; 434 435 ra_val = GET_SWAP_RA_VAL(vma); 436 win = SWAP_RA_WIN(ra_val); 437 hits = SWAP_RA_HITS(ra_val); 438 if (readahead) 439 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); 440 atomic_long_set(&vma->swap_readahead_info, 441 SWAP_RA_VAL(addr, win, hits)); 442 } 443 444 if (readahead) { 445 count_vm_event(SWAP_RA_HIT); 446 if (!vma || !vma_ra) 447 atomic_inc(&swapin_readahead_hits); 448 } 449 } 450 451 /** 452 * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. 453 * @entry: swap entry to be bound to the folio. 454 * @folio: folio to be added. 455 * @gfp: memory allocation flags for charge, can be 0 if @charged if true. 456 * @charged: if the folio is already charged. 457 * 458 * Update the swap_map and add folio as swap cache, typically before swapin. 459 * All swap slots covered by the folio must have a non-zero swap count. 460 * 461 * Context: Caller must protect the swap device with reference count or locks. 462 * Return: Returns the folio being added on success. Returns the existing folio 463 * if @entry is already cached. Returns NULL if raced with swapin or swapoff. 464 */ 465 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, 466 struct folio *folio, 467 gfp_t gfp, bool charged) 468 { 469 struct folio *swapcache = NULL; 470 void *shadow; 471 int ret; 472 473 __folio_set_locked(folio); 474 __folio_set_swapbacked(folio); 475 for (;;) { 476 ret = swap_cache_add_folio(folio, entry, &shadow); 477 if (!ret) 478 break; 479 480 /* 481 * Large order allocation needs special handling on 482 * race: if a smaller folio exists in cache, swapin needs 483 * to fallback to order 0, and doing a swap cache lookup 484 * might return a folio that is irrelevant to the faulting 485 * entry because @entry is aligned down. Just return NULL. 486 */ 487 if (ret != -EEXIST || folio_test_large(folio)) 488 goto failed; 489 490 swapcache = swap_cache_get_folio(entry); 491 if (swapcache) 492 goto failed; 493 } 494 495 if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { 496 swap_cache_del_folio(folio); 497 goto failed; 498 } 499 500 memcg1_swapin(entry, folio_nr_pages(folio)); 501 if (shadow) 502 workingset_refault(folio, shadow); 503 504 /* Caller will initiate read into locked folio */ 505 folio_add_lru(folio); 506 return folio; 507 508 failed: 509 folio_unlock(folio); 510 return swapcache; 511 } 512 513 /** 514 * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. 515 * @entry: the swapped out swap entry to be binded to the folio. 516 * @gfp_mask: memory allocation flags 517 * @mpol: NUMA memory allocation policy to be applied 518 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 519 * @new_page_allocated: sets true if allocation happened, false otherwise 520 * 521 * Allocate a folio in the swap cache for one swap slot, typically before 522 * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by 523 * @entry must have a non-zero swap count (swapped out). 524 * Currently only supports order 0. 525 * 526 * Context: Caller must protect the swap device with reference count or locks. 527 * Return: Returns the existing folio if @entry is cached already. Returns 528 * NULL if failed due to -ENOMEM or @entry have a swap count < 1. 529 */ 530 struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, 531 struct mempolicy *mpol, pgoff_t ilx, 532 bool *new_page_allocated) 533 { 534 struct swap_info_struct *si = __swap_entry_to_info(entry); 535 struct folio *folio; 536 struct folio *result = NULL; 537 538 *new_page_allocated = false; 539 /* Check the swap cache again for readahead path. */ 540 folio = swap_cache_get_folio(entry); 541 if (folio) 542 return folio; 543 544 /* Skip allocation for unused and bad swap slot for readahead. */ 545 if (!swap_entry_swapped(si, entry)) 546 return NULL; 547 548 /* Allocate a new folio to be added into the swap cache. */ 549 folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); 550 if (!folio) 551 return NULL; 552 /* Try add the new folio, returns existing folio or NULL on failure. */ 553 result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); 554 if (result == folio) 555 *new_page_allocated = true; 556 else 557 folio_put(folio); 558 return result; 559 } 560 561 /** 562 * swapin_folio - swap-in one or multiple entries skipping readahead. 563 * @entry: starting swap entry to swap in 564 * @folio: a new allocated and charged folio 565 * 566 * Reads @entry into @folio, @folio will be added to the swap cache. 567 * If @folio is a large folio, the @entry will be rounded down to align 568 * with the folio size. 569 * 570 * Return: returns pointer to @folio on success. If folio is a large folio 571 * and this raced with another swapin, NULL will be returned to allow fallback 572 * to order 0. Else, if another folio was already added to the swap cache, 573 * return that swap cache folio instead. 574 */ 575 struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) 576 { 577 struct folio *swapcache; 578 pgoff_t offset = swp_offset(entry); 579 unsigned long nr_pages = folio_nr_pages(folio); 580 581 entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); 582 swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); 583 if (swapcache == folio) 584 swap_read_folio(folio, NULL); 585 return swapcache; 586 } 587 588 /* 589 * Locate a page of swap in physical memory, reserving swap cache space 590 * and reading the disk if it is not already cached. 591 * A failure return means that either the page allocation failed or that 592 * the swap entry is no longer in use. 593 */ 594 struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 595 struct vm_area_struct *vma, unsigned long addr, 596 struct swap_iocb **plug) 597 { 598 struct swap_info_struct *si; 599 bool page_allocated; 600 struct mempolicy *mpol; 601 pgoff_t ilx; 602 struct folio *folio; 603 604 si = get_swap_device(entry); 605 if (!si) 606 return NULL; 607 608 mpol = get_vma_policy(vma, addr, 0, &ilx); 609 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 610 &page_allocated); 611 mpol_cond_put(mpol); 612 613 if (page_allocated) 614 swap_read_folio(folio, plug); 615 616 put_swap_device(si); 617 return folio; 618 } 619 620 static unsigned int __swapin_nr_pages(unsigned long prev_offset, 621 unsigned long offset, 622 int hits, 623 int max_pages, 624 int prev_win) 625 { 626 unsigned int pages, last_ra; 627 628 /* 629 * This heuristic has been found to work well on both sequential and 630 * random loads, swapping to hard disk or to SSD: please don't ask 631 * what the "+ 2" means, it just happens to work well, that's all. 632 */ 633 pages = hits + 2; 634 if (pages == 2) { 635 /* 636 * We can have no readahead hits to judge by: but must not get 637 * stuck here forever, so check for an adjacent offset instead 638 * (and don't even bother to check whether swap type is same). 639 */ 640 if (offset != prev_offset + 1 && offset != prev_offset - 1) 641 pages = 1; 642 } else { 643 unsigned int roundup = 4; 644 while (roundup < pages) 645 roundup <<= 1; 646 pages = roundup; 647 } 648 649 if (pages > max_pages) 650 pages = max_pages; 651 652 /* Don't shrink readahead too fast */ 653 last_ra = prev_win / 2; 654 if (pages < last_ra) 655 pages = last_ra; 656 657 return pages; 658 } 659 660 static unsigned long swapin_nr_pages(unsigned long offset) 661 { 662 static unsigned long prev_offset; 663 unsigned int hits, pages, max_pages; 664 static atomic_t last_readahead_pages; 665 666 max_pages = 1 << READ_ONCE(page_cluster); 667 if (max_pages <= 1) 668 return 1; 669 670 hits = atomic_xchg(&swapin_readahead_hits, 0); 671 pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, 672 max_pages, 673 atomic_read(&last_readahead_pages)); 674 if (!hits) 675 WRITE_ONCE(prev_offset, offset); 676 atomic_set(&last_readahead_pages, pages); 677 678 return pages; 679 } 680 681 /** 682 * swap_cluster_readahead - swap in pages in hope we need them soon 683 * @entry: swap entry of this memory 684 * @gfp_mask: memory allocation flags 685 * @mpol: NUMA memory allocation policy to be applied 686 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 687 * 688 * Returns the struct folio for entry and addr, after queueing swapin. 689 * 690 * Primitive swap readahead code. We simply read an aligned block of 691 * (1 << page_cluster) entries in the swap area. This method is chosen 692 * because it doesn't cost us any seek time. We also make sure to queue 693 * the 'original' request together with the readahead ones... 694 * 695 * Note: it is intentional that the same NUMA policy and interleave index 696 * are used for every page of the readahead: neighbouring pages on swap 697 * are fairly likely to have been swapped out from the same node. 698 */ 699 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 700 struct mempolicy *mpol, pgoff_t ilx) 701 { 702 struct folio *folio; 703 unsigned long entry_offset = swp_offset(entry); 704 unsigned long offset = entry_offset; 705 unsigned long start_offset, end_offset; 706 unsigned long mask; 707 struct swap_info_struct *si = __swap_entry_to_info(entry); 708 struct blk_plug plug; 709 struct swap_iocb *splug = NULL; 710 bool page_allocated; 711 712 mask = swapin_nr_pages(offset) - 1; 713 if (!mask) 714 goto skip; 715 716 /* Read a page_cluster sized and aligned cluster around offset. */ 717 start_offset = offset & ~mask; 718 end_offset = offset | mask; 719 if (!start_offset) /* First page is swap header. */ 720 start_offset++; 721 if (end_offset >= si->max) 722 end_offset = si->max - 1; 723 724 blk_start_plug(&plug); 725 for (offset = start_offset; offset <= end_offset ; offset++) { 726 /* Ok, do the async read-ahead now */ 727 folio = swap_cache_alloc_folio( 728 swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, 729 &page_allocated); 730 if (!folio) 731 continue; 732 if (page_allocated) { 733 swap_read_folio(folio, &splug); 734 if (offset != entry_offset) { 735 folio_set_readahead(folio); 736 count_vm_event(SWAP_RA); 737 } 738 } 739 folio_put(folio); 740 } 741 blk_finish_plug(&plug); 742 swap_read_unplug(splug); 743 lru_add_drain(); /* Push any new pages onto the LRU now */ 744 skip: 745 /* The page was likely read above, so no need for plugging here */ 746 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 747 &page_allocated); 748 if (unlikely(page_allocated)) 749 swap_read_folio(folio, NULL); 750 return folio; 751 } 752 753 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, 754 unsigned long *end) 755 { 756 struct vm_area_struct *vma = vmf->vma; 757 unsigned long ra_val; 758 unsigned long faddr, prev_faddr, left, right; 759 unsigned int max_win, hits, prev_win, win; 760 761 max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); 762 if (max_win == 1) 763 return 1; 764 765 faddr = vmf->address; 766 ra_val = GET_SWAP_RA_VAL(vma); 767 prev_faddr = SWAP_RA_ADDR(ra_val); 768 prev_win = SWAP_RA_WIN(ra_val); 769 hits = SWAP_RA_HITS(ra_val); 770 win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, 771 max_win, prev_win); 772 atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); 773 if (win == 1) 774 return 1; 775 776 if (faddr == prev_faddr + PAGE_SIZE) 777 left = faddr; 778 else if (prev_faddr == faddr + PAGE_SIZE) 779 left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; 780 else 781 left = faddr - (((win - 1) / 2) << PAGE_SHIFT); 782 right = left + (win << PAGE_SHIFT); 783 if ((long)left < 0) 784 left = 0; 785 *start = max3(left, vma->vm_start, faddr & PMD_MASK); 786 *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); 787 788 return win; 789 } 790 791 /** 792 * swap_vma_readahead - swap in pages in hope we need them soon 793 * @targ_entry: swap entry of the targeted memory 794 * @gfp_mask: memory allocation flags 795 * @mpol: NUMA memory allocation policy to be applied 796 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 797 * @vmf: fault information 798 * 799 * Returns the struct folio for entry and addr, after queueing swapin. 800 * 801 * Primitive swap readahead code. We simply read in a few pages whose 802 * virtual addresses are around the fault address in the same vma. 803 * 804 * Caller must hold read mmap_lock if vmf->vma is not NULL. 805 * 806 */ 807 static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, 808 struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) 809 { 810 struct blk_plug plug; 811 struct swap_iocb *splug = NULL; 812 struct folio *folio; 813 pte_t *pte = NULL, pentry; 814 int win; 815 unsigned long start, end, addr; 816 pgoff_t ilx; 817 bool page_allocated; 818 819 win = swap_vma_ra_win(vmf, &start, &end); 820 if (win == 1) 821 goto skip; 822 823 ilx = targ_ilx - PFN_DOWN(vmf->address - start); 824 825 blk_start_plug(&plug); 826 for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { 827 struct swap_info_struct *si = NULL; 828 softleaf_t entry; 829 830 if (!pte++) { 831 pte = pte_offset_map(vmf->pmd, addr); 832 if (!pte) 833 break; 834 } 835 pentry = ptep_get_lockless(pte); 836 entry = softleaf_from_pte(pentry); 837 838 if (!softleaf_is_swap(entry)) 839 continue; 840 pte_unmap(pte); 841 pte = NULL; 842 /* 843 * Readahead entry may come from a device that we are not 844 * holding a reference to, try to grab a reference, or skip. 845 */ 846 if (swp_type(entry) != swp_type(targ_entry)) { 847 si = get_swap_device(entry); 848 if (!si) 849 continue; 850 } 851 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 852 &page_allocated); 853 if (si) 854 put_swap_device(si); 855 if (!folio) 856 continue; 857 if (page_allocated) { 858 swap_read_folio(folio, &splug); 859 if (addr != vmf->address) { 860 folio_set_readahead(folio); 861 count_vm_event(SWAP_RA); 862 } 863 } 864 folio_put(folio); 865 } 866 if (pte) 867 pte_unmap(pte); 868 blk_finish_plug(&plug); 869 swap_read_unplug(splug); 870 lru_add_drain(); 871 skip: 872 /* The folio was likely read above, so no need for plugging here */ 873 folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, 874 &page_allocated); 875 if (unlikely(page_allocated)) 876 swap_read_folio(folio, NULL); 877 return folio; 878 } 879 880 /** 881 * swapin_readahead - swap in pages in hope we need them soon 882 * @entry: swap entry of this memory 883 * @gfp_mask: memory allocation flags 884 * @vmf: fault information 885 * 886 * Returns the struct folio for entry and addr, after queueing swapin. 887 * 888 * It's a main entry function for swap readahead. By the configuration, 889 * it will read ahead blocks by cluster-based(ie, physical disk based) 890 * or vma-based(ie, virtual address based on faulty address) readahead. 891 */ 892 struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 893 struct vm_fault *vmf) 894 { 895 struct mempolicy *mpol; 896 pgoff_t ilx; 897 struct folio *folio; 898 899 mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); 900 folio = swap_use_vma_readahead() ? 901 swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : 902 swap_cluster_readahead(entry, gfp_mask, mpol, ilx); 903 mpol_cond_put(mpol); 904 905 return folio; 906 } 907 908 #ifdef CONFIG_SYSFS 909 static ssize_t vma_ra_enabled_show(struct kobject *kobj, 910 struct kobj_attribute *attr, char *buf) 911 { 912 return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); 913 } 914 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 915 struct kobj_attribute *attr, 916 const char *buf, size_t count) 917 { 918 ssize_t ret; 919 920 ret = kstrtobool(buf, &enable_vma_readahead); 921 if (ret) 922 return ret; 923 924 return count; 925 } 926 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); 927 928 static struct attribute *swap_attrs[] = { 929 &vma_ra_enabled_attr.attr, 930 NULL, 931 }; 932 933 static const struct attribute_group swap_attr_group = { 934 .attrs = swap_attrs, 935 }; 936 937 static int __init swap_init(void) 938 { 939 int err; 940 struct kobject *swap_kobj; 941 942 swap_kobj = kobject_create_and_add("swap", mm_kobj); 943 if (!swap_kobj) { 944 pr_err("failed to create swap kobject\n"); 945 return -ENOMEM; 946 } 947 err = sysfs_create_group(swap_kobj, &swap_attr_group); 948 if (err) { 949 pr_err("failed to register swap group\n"); 950 goto delete_obj; 951 } 952 /* Swap cache writeback is LRU based, no tags for it */ 953 mapping_set_no_writeback_tags(&swap_space); 954 return 0; 955 956 delete_obj: 957 kobject_put(swap_kobj); 958 return err; 959 } 960 subsys_initcall(swap_init); 961 #endif 962