// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap_table.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
static const struct address_space_operations swap_aops = {
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space swap_space __read_mostly = {
	.a_ops = &swap_aops,
};

static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_ORDER_CEILING	5

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

/**
 * swap_cache_get_folio - Looks up a folio in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * A found folio will be returned unlocked and with its refcount increased.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns the found folio on success, NULL otherwise. The caller
 * must lock and check if the folio still matches the swap entry before
 * use (e.g., folio_matches_swap_entry).
 */
struct folio *swap_cache_get_folio(swp_entry_t entry)
{
	unsigned long swp_tb;
	struct folio *folio;

	for (;;) {
		swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
					swp_cluster_offset(entry));
		if (!swp_tb_is_folio(swp_tb))
			return NULL;
		folio = swp_tb_to_folio(swp_tb);
		if (likely(folio_try_get(folio)))
			return folio;
	}

	return NULL;
}
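
/*
 * Illustrative sketch only (not compiled, assumes the caller already holds a
 * reference on the swap device): a lookup result must be locked and
 * re-checked against the entry before use, as the kernel-doc above requires:
 *
 *	folio = swap_cache_get_folio(entry);
 *	if (folio) {
 *		folio_lock(folio);
 *		if (!folio_matches_swap_entry(folio, entry)) {
 *			folio_unlock(folio);
 *			folio_put(folio);
 *			folio = NULL;
 *		}
 *	}
 */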

/**
 * swap_cache_has_folio - Check if a swap slot has a folio cached.
 * @entry: swap entry indicating the slot.
 *
 * Context: Caller must ensure @entry is valid and protect the swap
 * device with reference count or locks.
 * Return: true if a folio is cached for the slot, false otherwise.
 */
bool swap_cache_has_folio(swp_entry_t entry)
{
	unsigned long swp_tb;

	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
				swp_cluster_offset(entry));
	return swp_tb_is_folio(swp_tb);
}

/**
 * swap_cache_get_shadow - Looks up a shadow in the swap cache.
 * @entry: swap entry used for the lookup.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: Returns either NULL or an XA_VALUE (shadow).
 */
void *swap_cache_get_shadow(swp_entry_t entry)
{
	unsigned long swp_tb;

	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
				swp_cluster_offset(entry));
	if (swp_tb_is_shadow(swp_tb))
		return swp_tb_to_shadow(swp_tb);
	return NULL;
}

void __swap_cache_add_folio(struct swap_cluster_info *ci,
			    struct folio *folio, swp_entry_t entry)
{
	unsigned long new_tb;
	unsigned int ci_start, ci_off, ci_end;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);

	new_tb = folio_to_swp_tb(folio);
	ci_start = swp_cluster_offset(entry);
	ci_off = ci_start;
	ci_end = ci_start + nr_pages;
	do {
		VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off)));
		__swap_table_set(ci, ci_off, new_tb);
	} while (++ci_off < ci_end);

	folio_ref_add(folio, nr_pages);
	folio_set_swapcache(folio);
	folio->swap = entry;

	node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
}

/**
 * swap_cache_add_folio - Add a folio into the swap cache.
 * @folio: The folio to be added.
 * @entry: The swap entry corresponding to the folio.
 * @shadowp: If a shadow is found, return the shadow.
 *
 * Context: Caller must ensure @entry is valid and protect the swap device
 * with reference count or locks.
 * Return: 0 on success; -EEXIST if a folio is already cached for @entry;
 * -ENOENT if the swap slots are no longer in use.
 */
static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
				void **shadowp)
{
	int err;
	void *shadow = NULL;
	unsigned long old_tb;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned int ci_start, ci_off, ci_end, offset;
	unsigned long nr_pages = folio_nr_pages(folio);

	si = __swap_entry_to_info(entry);
	ci_start = swp_cluster_offset(entry);
	ci_end = ci_start + nr_pages;
	ci_off = ci_start;
	offset = swp_offset(entry);
	ci = swap_cluster_lock(si, swp_offset(entry));
	if (unlikely(!ci->table)) {
		err = -ENOENT;
		goto failed;
	}
	do {
		old_tb = __swap_table_get(ci, ci_off);
		if (unlikely(swp_tb_is_folio(old_tb))) {
			err = -EEXIST;
			goto failed;
		}
		if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
			err = -ENOENT;
			goto failed;
		}
		if (swp_tb_is_shadow(old_tb))
			shadow = swp_tb_to_shadow(old_tb);
		offset++;
	} while (++ci_off < ci_end);
	__swap_cache_add_folio(ci, folio, entry);
	swap_cluster_unlock(ci);
	if (shadowp)
		*shadowp = shadow;
	return 0;

failed:
	swap_cluster_unlock(ci);
	return err;
}
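
/*
 * Illustrative sketch only (not compiled): a swap table slot observed by the
 * helpers above is in exactly one of three states, which a caller holding
 * the proper protection could distinguish like this:
 *
 *	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
 *				swp_cluster_offset(entry));
 *	if (swp_tb_is_folio(swp_tb))
 *		folio = swp_tb_to_folio(swp_tb);	// slot is cached
 *	else if (swp_tb_is_shadow(swp_tb))
 *		shadow = swp_tb_to_shadow(swp_tb);	// workingset shadow
 *	else
 *		;					// empty slot
 */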

/**
 * __swap_cache_del_folio - Removes a folio from the swap cache.
 * @ci: The locked swap cluster.
 * @folio: The folio.
 * @entry: The first swap entry that the folio corresponds to.
 * @shadow: shadow value to be filled in the swap cache.
 *
 * Removes a folio from the swap cache and fills a shadow in place.
 * This won't put the folio's refcount. The caller has to do that.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache
 * using the index of @entry, and lock the cluster that holds the entries.
 */
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
			    swp_entry_t entry, void *shadow)
{
	struct swap_info_struct *si;
	unsigned long old_tb, new_tb;
	unsigned int ci_start, ci_off, ci_end;
	bool folio_swapped = false, need_free = false;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);

	si = __swap_entry_to_info(entry);
	new_tb = shadow_swp_to_tb(shadow);
	ci_start = swp_cluster_offset(entry);
	ci_end = ci_start + nr_pages;
	ci_off = ci_start;
	do {
		/* If shadow is NULL, we set an empty shadow */
		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
			     swp_tb_to_folio(old_tb) != folio);
		if (__swap_count(swp_entry(si->type,
					   swp_offset(entry) + ci_off - ci_start)))
			folio_swapped = true;
		else
			need_free = true;
	} while (++ci_off < ci_end);

	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);

	if (!folio_swapped) {
		swap_entries_free(si, ci, swp_offset(entry), nr_pages);
	} else if (need_free) {
		do {
			if (!__swap_count(entry))
				swap_entries_free(si, ci, swp_offset(entry), 1);
			entry.val++;
		} while (--nr_pages);
	}
}

/**
 * swap_cache_del_folio - Removes a folio from the swap cache.
 * @folio: The folio.
 *
 * Same as __swap_cache_del_folio(), but handles the lock and refcount. The
 * caller must ensure the folio is either clean or has a swap count
 * equal to zero, or it may cause data loss.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 */
void swap_cache_del_folio(struct folio *folio)
{
	struct swap_cluster_info *ci;
	swp_entry_t entry = folio->swap;

	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
	__swap_cache_del_folio(ci, folio, entry, NULL);
	swap_cluster_unlock(ci);

	folio_ref_sub(folio, folio_nr_pages(folio));
}

/**
 * __swap_cache_replace_folio - Replace a folio in the swap cache.
 * @ci: The locked swap cluster.
 * @old: The old folio to be replaced.
 * @new: The new folio.
 *
 * Replace an existing folio in the swap cache with a new folio. The
 * caller is responsible for setting up the new folio's flags and swap
 * entries. Replacement will take the new folio's swap entry value as
 * the starting offset to override all slots covered by the new folio.
 *
 * Context: Caller must ensure both folios are locked, and lock the
 * cluster that holds the old folio to be replaced.
 */
void __swap_cache_replace_folio(struct swap_cluster_info *ci,
				struct folio *old, struct folio *new)
{
	swp_entry_t entry = new->swap;
	unsigned long nr_pages = folio_nr_pages(new);
	unsigned int ci_off = swp_cluster_offset(entry);
	unsigned int ci_end = ci_off + nr_pages;
	unsigned long old_tb, new_tb;

	VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
	VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
	VM_WARN_ON_ONCE(!entry.val);

	/* Swap cache still stores N entries instead of a high-order entry */
	new_tb = folio_to_swp_tb(new);
	do {
		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
	} while (++ci_off < ci_end);

	/*
	 * If the old folio is partially replaced (e.g., splitting a large
	 * folio, the old folio is shrunk, and new split sub folios replace
	 * the shrunk part), ensure the new folio doesn't overlap it.
	 */
	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
	    folio_order(old) != folio_order(new)) {
		ci_off = swp_cluster_offset(old->swap);
		ci_end = ci_off + folio_nr_pages(old);
		while (ci_off < ci_end)
			WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off++)) != old);
	}
}
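
/*
 * Illustrative sketch only (not compiled): a hypothetical caller replacing
 * @old with @new for the same entries, following the Context rules above
 * (both folios locked, cluster locked, and the caller sets up the new
 * folio's swap entry and flags itself):
 *
 *	new->swap = old->swap;
 *	folio_set_swapcache(new);
 *	ci = swap_cluster_lock(__swap_entry_to_info(old->swap),
 *			       swp_offset(old->swap));
 *	__swap_cache_replace_folio(ci, old, new);
 *	swap_cluster_unlock(ci);
 */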

/**
 * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
 * @entry: The starting index entry.
 * @nr_ents: How many slots need to be cleared.
 *
 * Context: Caller must ensure the range is valid, all in one single cluster,
 * not occupied by any folio, and lock the cluster.
 */
void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
{
	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
	unsigned int ci_off = swp_cluster_offset(entry), ci_end;
	unsigned long old;

	ci_end = ci_off + nr_ents;
	do {
		old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
		WARN_ON_ONCE(swp_tb_is_folio(old));
	} while (++ci_off < ci_end);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *					- Marcelo
 */
void free_swap_cache(struct folio *folio)
{
	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Free a folio, and also free any swap cache associated with
 * this folio if it is the last user.
 */
void free_folio_and_swap_cache(struct folio *folio)
{
	free_swap_cache(folio);
	if (!is_huge_zero_folio(folio))
		folio_put(folio);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	struct folio_batch folios;
	unsigned int refs[PAGEVEC_SIZE];

	folio_batch_init(&folios);
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		free_swap_cache(folio);
		refs[folios.nr] = 1;
		if (unlikely(encoded_page_flags(pages[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[folios.nr] = encoded_nr_pages(pages[++i]);

		if (folio_batch_add(&folios, folio) == 0)
			folios_put_refs(&folios, refs);
	}
	if (folios.nr)
		folios_put_refs(&folios, refs);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/**
 * swap_update_readahead - Update the readahead statistics of VMA or globally.
 * @folio: the swap cache folio that just got hit.
 * @vma: the VMA that should be updated, could be NULL for global update.
 * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
 */
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
			   unsigned long addr)
{
	bool readahead, vma_ra = swap_use_vma_readahead();

	/*
	 * At the moment, we don't support PG_readahead for anon THP
	 * so let's bail out rather than confusing the readahead stat.
	 */
	if (unlikely(folio_test_large(folio)))
		return;

	readahead = folio_test_clear_readahead(folio);
	if (vma && vma_ra) {
		unsigned long ra_val;
		int win, hits;

		ra_val = GET_SWAP_RA_VAL(vma);
		win = SWAP_RA_WIN(ra_val);
		hits = SWAP_RA_HITS(ra_val);
		if (readahead)
			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
		atomic_long_set(&vma->swap_readahead_info,
				SWAP_RA_VAL(addr, win, hits));
	}

	if (readahead) {
		count_vm_event(SWAP_RA_HIT);
		if (!vma || !vma_ra)
			atomic_inc(&swapin_readahead_hits);
	}
}
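
/*
 * Illustrative sketch only (not compiled): a typical call site, modelled on
 * the anonymous fault path, where a swap cache hit feeds the readahead
 * statistics before the folio is mapped (assumes @vmf describes the fault):
 *
 *	folio = swap_cache_get_folio(entry);
 *	if (folio)
 *		swap_update_readahead(folio, vmf->vma, vmf->address);
 */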

/**
 * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
 * @entry: swap entry to be bound to the folio.
 * @folio: folio to be added.
 * @gfp: memory allocation flags for charge, can be 0 if @charged is true.
 * @charged: if the folio is already charged.
 *
 * Update the swap_map and add folio as swap cache, typically before swapin.
 * All swap slots covered by the folio must have a non-zero swap count.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the folio being added on success. Returns the existing folio
 * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
 */
static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
						  struct folio *folio,
						  gfp_t gfp, bool charged)
{
	struct folio *swapcache = NULL;
	void *shadow;
	int ret;

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);
	for (;;) {
		ret = swap_cache_add_folio(folio, entry, &shadow);
		if (!ret)
			break;

		/*
		 * Large order allocation needs special handling on
		 * race: if a smaller folio exists in cache, swapin needs
		 * to fall back to order 0, and doing a swap cache lookup
		 * might return a folio that is irrelevant to the faulting
		 * entry because @entry is aligned down. Just return NULL.
		 */
		if (ret != -EEXIST || folio_test_large(folio))
			goto failed;

		swapcache = swap_cache_get_folio(entry);
		if (swapcache)
			goto failed;
	}

	if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
		swap_cache_del_folio(folio);
		goto failed;
	}

	memcg1_swapin(entry, folio_nr_pages(folio));
	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	return folio;

failed:
	folio_unlock(folio);
	return swapcache;
}

/**
 * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
 * @entry: the swapped out swap entry to be bound to the folio.
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @new_page_allocated: set to true if a new folio was allocated, false otherwise
 *
 * Allocate a folio in the swap cache for one swap slot, typically before
 * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
 * @entry must have a non-zero swap count (swapped out).
 * Currently only supports order 0.
 *
 * Context: Caller must protect the swap device with reference count or locks.
 * Return: Returns the newly allocated folio (and sets *@new_page_allocated to
 * true) on success. Returns the existing folio if @entry is cached already.
 * Returns NULL if failed due to -ENOMEM or if @entry has a swap count < 1.
 */
struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx,
				     bool *new_page_allocated)
{
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct folio *folio;
	struct folio *result = NULL;

	*new_page_allocated = false;
	/* Check the swap cache again for readahead path. */
	folio = swap_cache_get_folio(entry);
	if (folio)
		return folio;

	/* Skip allocation for unused and bad swap slot for readahead. */
	if (!swap_entry_swapped(si, entry))
		return NULL;

	/* Allocate a new folio to be added into the swap cache. */
	folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
	if (!folio)
		return NULL;
	/* Try to add the new folio, returns existing folio or NULL on failure. */
	result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
	if (result == folio)
		*new_page_allocated = true;
	else
		folio_put(folio);
	return result;
}

/**
 * swapin_folio - swap in one or multiple entries skipping readahead.
 * @entry: starting swap entry to swap in
 * @folio: a newly allocated and charged folio
 *
 * Reads @entry into @folio; @folio will be added to the swap cache.
 * If @folio is a large folio, @entry will be rounded down to align
 * with the folio size.
 *
 * Return: returns a pointer to @folio on success. If @folio is a large folio
 * and this raced with another swapin, NULL is returned to allow a fallback
 * to order 0. Otherwise, if another folio was already added to the swap
 * cache, that swap cache folio is returned instead.
 */
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
	struct folio *swapcache;
	pgoff_t offset = swp_offset(entry);
	unsigned long nr_pages = folio_nr_pages(folio);

	entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
	if (swapcache == folio)
		swap_read_folio(folio, NULL);
	return swapcache;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	struct swap_info_struct *si;
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	si = get_swap_device(entry);
	if (!si)
		return NULL;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	mpol_cond_put(mpol);

	if (page_allocated)
		swap_read_folio(folio, plug);

	put_swap_device(si);
	return folio;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}
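
/*
 * Worked example (illustration only, assuming page_cluster == 3 so
 * max_pages == 8): with hits == 4, __swapin_nr_pages() starts from
 * 4 + 2 == 6 pages, rounds up to the next power of two (8) and caps at
 * max_pages, giving a window of 8. With hits == 0 and an offset that is
 * not adjacent to the previous one, the window collapses to a single page,
 * but it never drops below half of the previous window.
 */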

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				     struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		folio = swap_cache_alloc_folio(
				swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
				&page_allocated);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
			   unsigned long *end)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, prev_faddr, left, right;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
	if (max_win == 1)
		return 1;

	faddr = vmf->address;
	ra_val = GET_SWAP_RA_VAL(vma);
	prev_faddr = SWAP_RA_ADDR(ra_val);
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
				max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return 1;

	if (faddr == prev_faddr + PAGE_SIZE)
		left = faddr;
	else if (prev_faddr == faddr + PAGE_SIZE)
		left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
	else
		left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
	right = left + (win << PAGE_SHIFT);
	if ((long)left < 0)
		left = 0;
	*start = max3(left, vma->vm_start, faddr & PMD_MASK);
	*end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);

	return win;
}
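
/*
 * Worked example (illustration only): with win == 8 and a fault address that
 * neither follows nor precedes the previous fault, swap_vma_ra_win() centres
 * the window on the fault: left = faddr - 3 pages, right = left + 8 pages,
 * then both ends are clamped to the VMA boundaries and to the PMD that
 * contains faddr. A strictly sequential fault (faddr == prev_faddr +
 * PAGE_SIZE) instead places the whole window ahead of faddr.
 */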

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	int win;
	unsigned long start, end, addr;
	pgoff_t ilx;
	bool page_allocated;

	win = swap_vma_ra_win(vmf, &start, &end);
	if (win == 1)
		goto skip;

	ilx = targ_ilx - PFN_DOWN(vmf->address - start);

	blk_start_plug(&plug);
	for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
		struct swap_info_struct *si = NULL;
		softleaf_t entry;

		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		entry = softleaf_from_pte(pentry);

		if (!softleaf_is_swap(entry))
			continue;
		pte_unmap(pte);
		pte = NULL;
		/*
		 * Readahead entry may come from a device that we are not
		 * holding a reference to, try to grab a reference, or skip.
		 */
		if (swp_type(entry) != swp_type(targ_entry)) {
			si = get_swap_device(entry);
			if (!si)
				continue;
		}
		folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
					       &page_allocated);
		if (si)
			put_swap_device(si);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, &splug);
			if (addr != vmf->address) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
				       &page_allocated);
	if (unlikely(page_allocated))
		swap_read_folio(folio, NULL);
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * It's the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based (i.e.,
 * physical disk based) or VMA-based (i.e., virtual address based on the
 * faulting address) readahead.
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			       struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}
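
/*
 * Illustrative sketch only (not compiled): how a fault handler would
 * typically invoke the entry point above when the faulting entry is not in
 * the swap cache (the GFP flags are an assumption, modelled on the
 * anonymous fault path):
 *
 *	folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
 *	if (!folio)
 *		;	// either OOM or the entry was freed under us
 */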

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	/* Swap cache writeback is LRU based, no tags for it */
	mapping_set_no_writeback_tags(&swap_space);
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init);
#endif
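
/*
 * Usage note (sketch): with CONFIG_SYSFS, the knob registered above is
 * expected to appear as /sys/kernel/mm/swap/vma_ra_enabled, so VMA-based
 * readahead can be toggled from userspace, e.g.:
 *
 *	echo false > /sys/kernel/mm/swap/vma_ra_enabled
 */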