1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/mm/swap_state.c 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 * 8 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 9 */ 10 #include <linux/mm.h> 11 #include <linux/gfp.h> 12 #include <linux/kernel_stat.h> 13 #include <linux/mempolicy.h> 14 #include <linux/swap.h> 15 #include <linux/leafops.h> 16 #include <linux/init.h> 17 #include <linux/pagemap.h> 18 #include <linux/pagevec.h> 19 #include <linux/backing-dev.h> 20 #include <linux/blkdev.h> 21 #include <linux/migrate.h> 22 #include <linux/vmalloc.h> 23 #include <linux/huge_mm.h> 24 #include <linux/shmem_fs.h> 25 #include "internal.h" 26 #include "swap_table.h" 27 #include "swap.h" 28 29 /* 30 * swapper_space is a fiction, retained to simplify the path through 31 * vmscan's shrink_folio_list. 32 */ 33 static const struct address_space_operations swap_aops = { 34 .dirty_folio = noop_dirty_folio, 35 #ifdef CONFIG_MIGRATION 36 .migrate_folio = migrate_folio, 37 #endif 38 }; 39 40 struct address_space swap_space __read_mostly = { 41 .a_ops = &swap_aops, 42 }; 43 44 static bool enable_vma_readahead __read_mostly = true; 45 46 #define SWAP_RA_ORDER_CEILING 5 47 48 #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 49 #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 50 #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK 51 #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) 52 53 #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) 54 #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) 55 #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) 56 57 #define SWAP_RA_VAL(addr, win, hits) \ 58 (((addr) & PAGE_MASK) | \ 59 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ 60 ((hits) & SWAP_RA_HITS_MASK)) 61 62 /* Initial readahead hits is 4 to start up with a small window */ 63 #define GET_SWAP_RA_VAL(vma) \ 64 (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) 65 66 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); 67 68 void show_swap_cache_info(void) 69 { 70 printk("%lu pages in swap cache\n", total_swapcache_pages()); 71 printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); 72 printk("Total swap = %lukB\n", K(total_swap_pages)); 73 } 74 75 /** 76 * swap_cache_get_folio - Looks up a folio in the swap cache. 77 * @entry: swap entry used for the lookup. 78 * 79 * A found folio will be returned unlocked and with its refcount increased. 80 * 81 * Context: Caller must ensure @entry is valid and protect the swap device 82 * with reference count or locks. 83 * Return: Returns the found folio on success, NULL otherwise. The caller 84 * must lock and check if the folio still matches the swap entry before 85 * use (e.g., folio_matches_swap_entry). 86 */ 87 struct folio *swap_cache_get_folio(swp_entry_t entry) 88 { 89 unsigned long swp_tb; 90 struct folio *folio; 91 92 for (;;) { 93 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 94 swp_cluster_offset(entry)); 95 if (!swp_tb_is_folio(swp_tb)) 96 return NULL; 97 folio = swp_tb_to_folio(swp_tb); 98 if (likely(folio_try_get(folio))) 99 return folio; 100 } 101 102 return NULL; 103 } 104 105 /** 106 * swap_cache_has_folio - Check if a swap slot has cache. 107 * @entry: swap entry indicating the slot. 108 * 109 * Context: Caller must ensure @entry is valid and protect the swap 110 * device with reference count or locks. 111 */ 112 bool swap_cache_has_folio(swp_entry_t entry) 113 { 114 unsigned long swp_tb; 115 116 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 117 swp_cluster_offset(entry)); 118 return swp_tb_is_folio(swp_tb); 119 } 120 121 /** 122 * swap_cache_get_shadow - Looks up a shadow in the swap cache. 123 * @entry: swap entry used for the lookup. 124 * 125 * Context: Caller must ensure @entry is valid and protect the swap device 126 * with reference count or locks. 127 * Return: Returns either NULL or an XA_VALUE (shadow). 128 */ 129 void *swap_cache_get_shadow(swp_entry_t entry) 130 { 131 unsigned long swp_tb; 132 133 swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 134 swp_cluster_offset(entry)); 135 if (swp_tb_is_shadow(swp_tb)) 136 return swp_tb_to_shadow(swp_tb); 137 return NULL; 138 } 139 140 void __swap_cache_add_folio(struct swap_cluster_info *ci, 141 struct folio *folio, swp_entry_t entry) 142 { 143 unsigned long new_tb; 144 unsigned int ci_start, ci_off, ci_end; 145 unsigned long nr_pages = folio_nr_pages(folio); 146 147 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 148 VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); 149 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); 150 151 new_tb = folio_to_swp_tb(folio); 152 ci_start = swp_cluster_offset(entry); 153 ci_off = ci_start; 154 ci_end = ci_start + nr_pages; 155 do { 156 VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); 157 __swap_table_set(ci, ci_off, new_tb); 158 } while (++ci_off < ci_end); 159 160 folio_ref_add(folio, nr_pages); 161 folio_set_swapcache(folio); 162 folio->swap = entry; 163 164 node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); 165 lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); 166 } 167 168 /** 169 * swap_cache_add_folio - Add a folio into the swap cache. 170 * @folio: The folio to be added. 171 * @entry: The swap entry corresponding to the folio. 172 * @gfp: gfp_mask for XArray node allocation. 173 * @shadowp: If a shadow is found, return the shadow. 174 * 175 * Context: Caller must ensure @entry is valid and protect the swap device 176 * with reference count or locks. 177 */ 178 static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, 179 void **shadowp) 180 { 181 int err; 182 void *shadow = NULL; 183 unsigned long old_tb; 184 struct swap_info_struct *si; 185 struct swap_cluster_info *ci; 186 unsigned int ci_start, ci_off, ci_end, offset; 187 unsigned long nr_pages = folio_nr_pages(folio); 188 189 si = __swap_entry_to_info(entry); 190 ci_start = swp_cluster_offset(entry); 191 ci_end = ci_start + nr_pages; 192 ci_off = ci_start; 193 offset = swp_offset(entry); 194 ci = swap_cluster_lock(si, swp_offset(entry)); 195 if (unlikely(!ci->table)) { 196 err = -ENOENT; 197 goto failed; 198 } 199 do { 200 old_tb = __swap_table_get(ci, ci_off); 201 if (unlikely(swp_tb_is_folio(old_tb))) { 202 err = -EEXIST; 203 goto failed; 204 } 205 if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { 206 err = -ENOENT; 207 goto failed; 208 } 209 if (swp_tb_is_shadow(old_tb)) 210 shadow = swp_tb_to_shadow(old_tb); 211 offset++; 212 } while (++ci_off < ci_end); 213 __swap_cache_add_folio(ci, folio, entry); 214 swap_cluster_unlock(ci); 215 if (shadowp) 216 *shadowp = shadow; 217 return 0; 218 219 failed: 220 swap_cluster_unlock(ci); 221 return err; 222 } 223 224 /** 225 * __swap_cache_del_folio - Removes a folio from the swap cache. 226 * @ci: The locked swap cluster. 227 * @folio: The folio. 228 * @entry: The first swap entry that the folio corresponds to. 229 * @shadow: shadow value to be filled in the swap cache. 230 * 231 * Removes a folio from the swap cache and fills a shadow in place. 232 * This won't put the folio's refcount. The caller has to do that. 233 * 234 * Context: Caller must ensure the folio is locked and in the swap cache 235 * using the index of @entry, and lock the cluster that holds the entries. 236 */ 237 void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, 238 swp_entry_t entry, void *shadow) 239 { 240 struct swap_info_struct *si; 241 unsigned long old_tb, new_tb; 242 unsigned int ci_start, ci_off, ci_end; 243 bool folio_swapped = false, need_free = false; 244 unsigned long nr_pages = folio_nr_pages(folio); 245 246 VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); 247 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 248 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); 249 VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); 250 251 si = __swap_entry_to_info(entry); 252 new_tb = shadow_swp_to_tb(shadow); 253 ci_start = swp_cluster_offset(entry); 254 ci_end = ci_start + nr_pages; 255 ci_off = ci_start; 256 do { 257 /* If shadow is NULL, we sets an empty shadow */ 258 old_tb = __swap_table_xchg(ci, ci_off, new_tb); 259 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || 260 swp_tb_to_folio(old_tb) != folio); 261 if (__swap_count(swp_entry(si->type, 262 swp_offset(entry) + ci_off - ci_start))) 263 folio_swapped = true; 264 else 265 need_free = true; 266 } while (++ci_off < ci_end); 267 268 folio->swap.val = 0; 269 folio_clear_swapcache(folio); 270 node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); 271 lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); 272 273 if (!folio_swapped) { 274 swap_entries_free(si, ci, swp_offset(entry), nr_pages); 275 } else if (need_free) { 276 do { 277 if (!__swap_count(entry)) 278 swap_entries_free(si, ci, swp_offset(entry), 1); 279 entry.val++; 280 } while (--nr_pages); 281 } 282 } 283 284 /** 285 * swap_cache_del_folio - Removes a folio from the swap cache. 286 * @folio: The folio. 287 * 288 * Same as __swap_cache_del_folio, but handles lock and refcount. The 289 * caller must ensure the folio is either clean or has a swap count 290 * equal to zero, or it may cause data loss. 291 * 292 * Context: Caller must ensure the folio is locked and in the swap cache. 293 */ 294 void swap_cache_del_folio(struct folio *folio) 295 { 296 struct swap_cluster_info *ci; 297 swp_entry_t entry = folio->swap; 298 299 ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); 300 __swap_cache_del_folio(ci, folio, entry, NULL); 301 swap_cluster_unlock(ci); 302 303 folio_ref_sub(folio, folio_nr_pages(folio)); 304 } 305 306 /** 307 * __swap_cache_replace_folio - Replace a folio in the swap cache. 308 * @ci: The locked swap cluster. 309 * @old: The old folio to be replaced. 310 * @new: The new folio. 311 * 312 * Replace an existing folio in the swap cache with a new folio. The 313 * caller is responsible for setting up the new folio's flag and swap 314 * entries. Replacement will take the new folio's swap entry value as 315 * the starting offset to override all slots covered by the new folio. 316 * 317 * Context: Caller must ensure both folios are locked, and lock the 318 * cluster that holds the old folio to be replaced. 319 */ 320 void __swap_cache_replace_folio(struct swap_cluster_info *ci, 321 struct folio *old, struct folio *new) 322 { 323 swp_entry_t entry = new->swap; 324 unsigned long nr_pages = folio_nr_pages(new); 325 unsigned int ci_off = swp_cluster_offset(entry); 326 unsigned int ci_end = ci_off + nr_pages; 327 unsigned long old_tb, new_tb; 328 329 VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); 330 VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); 331 VM_WARN_ON_ONCE(!entry.val); 332 333 /* Swap cache still stores N entries instead of a high-order entry */ 334 new_tb = folio_to_swp_tb(new); 335 do { 336 old_tb = __swap_table_xchg(ci, ci_off, new_tb); 337 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); 338 } while (++ci_off < ci_end); 339 340 /* 341 * If the old folio is partially replaced (e.g., splitting a large 342 * folio, the old folio is shrunk, and new split sub folios replace 343 * the shrunk part), ensure the new folio doesn't overlap it. 344 */ 345 if (IS_ENABLED(CONFIG_DEBUG_VM) && 346 folio_order(old) != folio_order(new)) { 347 ci_off = swp_cluster_offset(old->swap); 348 ci_end = ci_off + folio_nr_pages(old); 349 while (ci_off++ < ci_end) 350 WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); 351 } 352 } 353 354 /** 355 * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache. 356 * @entry: The starting index entry. 357 * @nr_ents: How many slots need to be cleared. 358 * 359 * Context: Caller must ensure the range is valid, all in one single cluster, 360 * not occupied by any folio, and lock the cluster. 361 */ 362 void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) 363 { 364 struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); 365 unsigned int ci_off = swp_cluster_offset(entry), ci_end; 366 unsigned long old; 367 368 ci_end = ci_off + nr_ents; 369 do { 370 old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); 371 WARN_ON_ONCE(swp_tb_is_folio(old)); 372 } while (++ci_off < ci_end); 373 } 374 375 /* 376 * If we are the only user, then try to free up the swap cache. 377 * 378 * Its ok to check the swapcache flag without the folio lock 379 * here because we are going to recheck again inside 380 * folio_free_swap() _with_ the lock. 381 * - Marcelo 382 */ 383 void free_swap_cache(struct folio *folio) 384 { 385 if (folio_test_swapcache(folio) && !folio_mapped(folio) && 386 folio_trylock(folio)) { 387 folio_free_swap(folio); 388 folio_unlock(folio); 389 } 390 } 391 392 /* 393 * Freeing a folio and also freeing any swap cache associated with 394 * this folio if it is the last user. 395 */ 396 void free_folio_and_swap_cache(struct folio *folio) 397 { 398 free_swap_cache(folio); 399 if (!is_huge_zero_folio(folio)) 400 folio_put(folio); 401 } 402 403 /* 404 * Passed an array of pages, drop them all from swapcache and then release 405 * them. They are removed from the LRU and freed if this is their last use. 406 */ 407 void free_pages_and_swap_cache(struct encoded_page **pages, int nr) 408 { 409 struct folio_batch folios; 410 unsigned int refs[PAGEVEC_SIZE]; 411 412 folio_batch_init(&folios); 413 for (int i = 0; i < nr; i++) { 414 struct folio *folio = page_folio(encoded_page_ptr(pages[i])); 415 416 free_swap_cache(folio); 417 refs[folios.nr] = 1; 418 if (unlikely(encoded_page_flags(pages[i]) & 419 ENCODED_PAGE_BIT_NR_PAGES_NEXT)) 420 refs[folios.nr] = encoded_nr_pages(pages[++i]); 421 422 if (folio_batch_add(&folios, folio) == 0) 423 folios_put_refs(&folios, refs); 424 } 425 if (folios.nr) 426 folios_put_refs(&folios, refs); 427 } 428 429 static inline bool swap_use_vma_readahead(void) 430 { 431 return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); 432 } 433 434 /** 435 * swap_update_readahead - Update the readahead statistics of VMA or globally. 436 * @folio: the swap cache folio that just got hit. 437 * @vma: the VMA that should be updated, could be NULL for global update. 438 * @addr: the addr that triggered the swapin, ignored if @vma is NULL. 439 */ 440 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, 441 unsigned long addr) 442 { 443 bool readahead, vma_ra = swap_use_vma_readahead(); 444 445 /* 446 * At the moment, we don't support PG_readahead for anon THP 447 * so let's bail out rather than confusing the readahead stat. 448 */ 449 if (unlikely(folio_test_large(folio))) 450 return; 451 452 readahead = folio_test_clear_readahead(folio); 453 if (vma && vma_ra) { 454 unsigned long ra_val; 455 int win, hits; 456 457 ra_val = GET_SWAP_RA_VAL(vma); 458 win = SWAP_RA_WIN(ra_val); 459 hits = SWAP_RA_HITS(ra_val); 460 if (readahead) 461 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); 462 atomic_long_set(&vma->swap_readahead_info, 463 SWAP_RA_VAL(addr, win, hits)); 464 } 465 466 if (readahead) { 467 count_vm_event(SWAP_RA_HIT); 468 if (!vma || !vma_ra) 469 atomic_inc(&swapin_readahead_hits); 470 } 471 } 472 473 /** 474 * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. 475 * @entry: swap entry to be bound to the folio. 476 * @folio: folio to be added. 477 * @gfp: memory allocation flags for charge, can be 0 if @charged if true. 478 * @charged: if the folio is already charged. 479 * 480 * Update the swap_map and add folio as swap cache, typically before swapin. 481 * All swap slots covered by the folio must have a non-zero swap count. 482 * 483 * Context: Caller must protect the swap device with reference count or locks. 484 * Return: Returns the folio being added on success. Returns the existing folio 485 * if @entry is already cached. Returns NULL if raced with swapin or swapoff. 486 */ 487 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, 488 struct folio *folio, 489 gfp_t gfp, bool charged) 490 { 491 struct folio *swapcache = NULL; 492 void *shadow; 493 int ret; 494 495 __folio_set_locked(folio); 496 __folio_set_swapbacked(folio); 497 498 if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) 499 goto failed; 500 501 for (;;) { 502 ret = swap_cache_add_folio(folio, entry, &shadow); 503 if (!ret) 504 break; 505 506 /* 507 * Large order allocation needs special handling on 508 * race: if a smaller folio exists in cache, swapin needs 509 * to fallback to order 0, and doing a swap cache lookup 510 * might return a folio that is irrelevant to the faulting 511 * entry because @entry is aligned down. Just return NULL. 512 */ 513 if (ret != -EEXIST || folio_test_large(folio)) 514 goto failed; 515 516 swapcache = swap_cache_get_folio(entry); 517 if (swapcache) 518 goto failed; 519 } 520 521 memcg1_swapin(entry, folio_nr_pages(folio)); 522 if (shadow) 523 workingset_refault(folio, shadow); 524 525 /* Caller will initiate read into locked folio */ 526 folio_add_lru(folio); 527 return folio; 528 529 failed: 530 folio_unlock(folio); 531 return swapcache; 532 } 533 534 /** 535 * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. 536 * @entry: the swapped out swap entry to be binded to the folio. 537 * @gfp_mask: memory allocation flags 538 * @mpol: NUMA memory allocation policy to be applied 539 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 540 * @new_page_allocated: sets true if allocation happened, false otherwise 541 * 542 * Allocate a folio in the swap cache for one swap slot, typically before 543 * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by 544 * @entry must have a non-zero swap count (swapped out). 545 * Currently only supports order 0. 546 * 547 * Context: Caller must protect the swap device with reference count or locks. 548 * Return: Returns the existing folio if @entry is cached already. Returns 549 * NULL if failed due to -ENOMEM or @entry have a swap count < 1. 550 */ 551 struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, 552 struct mempolicy *mpol, pgoff_t ilx, 553 bool *new_page_allocated) 554 { 555 struct swap_info_struct *si = __swap_entry_to_info(entry); 556 struct folio *folio; 557 struct folio *result = NULL; 558 559 *new_page_allocated = false; 560 /* Check the swap cache again for readahead path. */ 561 folio = swap_cache_get_folio(entry); 562 if (folio) 563 return folio; 564 565 /* Skip allocation for unused and bad swap slot for readahead. */ 566 if (!swap_entry_swapped(si, entry)) 567 return NULL; 568 569 /* Allocate a new folio to be added into the swap cache. */ 570 folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); 571 if (!folio) 572 return NULL; 573 /* Try add the new folio, returns existing folio or NULL on failure. */ 574 result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); 575 if (result == folio) 576 *new_page_allocated = true; 577 else 578 folio_put(folio); 579 return result; 580 } 581 582 /** 583 * swapin_folio - swap-in one or multiple entries skipping readahead. 584 * @entry: starting swap entry to swap in 585 * @folio: a new allocated and charged folio 586 * 587 * Reads @entry into @folio, @folio will be added to the swap cache. 588 * If @folio is a large folio, the @entry will be rounded down to align 589 * with the folio size. 590 * 591 * Return: returns pointer to @folio on success. If folio is a large folio 592 * and this raced with another swapin, NULL will be returned to allow fallback 593 * to order 0. Else, if another folio was already added to the swap cache, 594 * return that swap cache folio instead. 595 */ 596 struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) 597 { 598 struct folio *swapcache; 599 pgoff_t offset = swp_offset(entry); 600 unsigned long nr_pages = folio_nr_pages(folio); 601 602 entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); 603 swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); 604 if (swapcache == folio) 605 swap_read_folio(folio, NULL); 606 return swapcache; 607 } 608 609 /* 610 * Locate a page of swap in physical memory, reserving swap cache space 611 * and reading the disk if it is not already cached. 612 * A failure return means that either the page allocation failed or that 613 * the swap entry is no longer in use. 614 */ 615 struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 616 struct vm_area_struct *vma, unsigned long addr, 617 struct swap_iocb **plug) 618 { 619 struct swap_info_struct *si; 620 bool page_allocated; 621 struct mempolicy *mpol; 622 pgoff_t ilx; 623 struct folio *folio; 624 625 si = get_swap_device(entry); 626 if (!si) 627 return NULL; 628 629 mpol = get_vma_policy(vma, addr, 0, &ilx); 630 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 631 &page_allocated); 632 mpol_cond_put(mpol); 633 634 if (page_allocated) 635 swap_read_folio(folio, plug); 636 637 put_swap_device(si); 638 return folio; 639 } 640 641 static unsigned int __swapin_nr_pages(unsigned long prev_offset, 642 unsigned long offset, 643 int hits, 644 int max_pages, 645 int prev_win) 646 { 647 unsigned int pages, last_ra; 648 649 /* 650 * This heuristic has been found to work well on both sequential and 651 * random loads, swapping to hard disk or to SSD: please don't ask 652 * what the "+ 2" means, it just happens to work well, that's all. 653 */ 654 pages = hits + 2; 655 if (pages == 2) { 656 /* 657 * We can have no readahead hits to judge by: but must not get 658 * stuck here forever, so check for an adjacent offset instead 659 * (and don't even bother to check whether swap type is same). 660 */ 661 if (offset != prev_offset + 1 && offset != prev_offset - 1) 662 pages = 1; 663 } else { 664 unsigned int roundup = 4; 665 while (roundup < pages) 666 roundup <<= 1; 667 pages = roundup; 668 } 669 670 if (pages > max_pages) 671 pages = max_pages; 672 673 /* Don't shrink readahead too fast */ 674 last_ra = prev_win / 2; 675 if (pages < last_ra) 676 pages = last_ra; 677 678 return pages; 679 } 680 681 static unsigned long swapin_nr_pages(unsigned long offset) 682 { 683 static unsigned long prev_offset; 684 unsigned int hits, pages, max_pages; 685 static atomic_t last_readahead_pages; 686 687 max_pages = 1 << READ_ONCE(page_cluster); 688 if (max_pages <= 1) 689 return 1; 690 691 hits = atomic_xchg(&swapin_readahead_hits, 0); 692 pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, 693 max_pages, 694 atomic_read(&last_readahead_pages)); 695 if (!hits) 696 WRITE_ONCE(prev_offset, offset); 697 atomic_set(&last_readahead_pages, pages); 698 699 return pages; 700 } 701 702 /** 703 * swap_cluster_readahead - swap in pages in hope we need them soon 704 * @entry: swap entry of this memory 705 * @gfp_mask: memory allocation flags 706 * @mpol: NUMA memory allocation policy to be applied 707 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 708 * 709 * Returns the struct folio for entry and addr, after queueing swapin. 710 * 711 * Primitive swap readahead code. We simply read an aligned block of 712 * (1 << page_cluster) entries in the swap area. This method is chosen 713 * because it doesn't cost us any seek time. We also make sure to queue 714 * the 'original' request together with the readahead ones... 715 * 716 * Note: it is intentional that the same NUMA policy and interleave index 717 * are used for every page of the readahead: neighbouring pages on swap 718 * are fairly likely to have been swapped out from the same node. 719 */ 720 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 721 struct mempolicy *mpol, pgoff_t ilx) 722 { 723 struct folio *folio; 724 unsigned long entry_offset = swp_offset(entry); 725 unsigned long offset = entry_offset; 726 unsigned long start_offset, end_offset; 727 unsigned long mask; 728 struct swap_info_struct *si = __swap_entry_to_info(entry); 729 struct blk_plug plug; 730 struct swap_iocb *splug = NULL; 731 bool page_allocated; 732 733 mask = swapin_nr_pages(offset) - 1; 734 if (!mask) 735 goto skip; 736 737 /* Read a page_cluster sized and aligned cluster around offset. */ 738 start_offset = offset & ~mask; 739 end_offset = offset | mask; 740 if (!start_offset) /* First page is swap header. */ 741 start_offset++; 742 if (end_offset >= si->max) 743 end_offset = si->max - 1; 744 745 blk_start_plug(&plug); 746 for (offset = start_offset; offset <= end_offset ; offset++) { 747 /* Ok, do the async read-ahead now */ 748 folio = swap_cache_alloc_folio( 749 swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, 750 &page_allocated); 751 if (!folio) 752 continue; 753 if (page_allocated) { 754 swap_read_folio(folio, &splug); 755 if (offset != entry_offset) { 756 folio_set_readahead(folio); 757 count_vm_event(SWAP_RA); 758 } 759 } 760 folio_put(folio); 761 } 762 blk_finish_plug(&plug); 763 swap_read_unplug(splug); 764 lru_add_drain(); /* Push any new pages onto the LRU now */ 765 skip: 766 /* The page was likely read above, so no need for plugging here */ 767 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 768 &page_allocated); 769 if (unlikely(page_allocated)) 770 swap_read_folio(folio, NULL); 771 return folio; 772 } 773 774 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, 775 unsigned long *end) 776 { 777 struct vm_area_struct *vma = vmf->vma; 778 unsigned long ra_val; 779 unsigned long faddr, prev_faddr, left, right; 780 unsigned int max_win, hits, prev_win, win; 781 782 max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); 783 if (max_win == 1) 784 return 1; 785 786 faddr = vmf->address; 787 ra_val = GET_SWAP_RA_VAL(vma); 788 prev_faddr = SWAP_RA_ADDR(ra_val); 789 prev_win = SWAP_RA_WIN(ra_val); 790 hits = SWAP_RA_HITS(ra_val); 791 win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, 792 max_win, prev_win); 793 atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); 794 if (win == 1) 795 return 1; 796 797 if (faddr == prev_faddr + PAGE_SIZE) 798 left = faddr; 799 else if (prev_faddr == faddr + PAGE_SIZE) 800 left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; 801 else 802 left = faddr - (((win - 1) / 2) << PAGE_SHIFT); 803 right = left + (win << PAGE_SHIFT); 804 if ((long)left < 0) 805 left = 0; 806 *start = max3(left, vma->vm_start, faddr & PMD_MASK); 807 *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); 808 809 return win; 810 } 811 812 /** 813 * swap_vma_readahead - swap in pages in hope we need them soon 814 * @targ_entry: swap entry of the targeted memory 815 * @gfp_mask: memory allocation flags 816 * @mpol: NUMA memory allocation policy to be applied 817 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE 818 * @vmf: fault information 819 * 820 * Returns the struct folio for entry and addr, after queueing swapin. 821 * 822 * Primitive swap readahead code. We simply read in a few pages whose 823 * virtual addresses are around the fault address in the same vma. 824 * 825 * Caller must hold read mmap_lock if vmf->vma is not NULL. 826 * 827 */ 828 static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, 829 struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) 830 { 831 struct blk_plug plug; 832 struct swap_iocb *splug = NULL; 833 struct folio *folio; 834 pte_t *pte = NULL, pentry; 835 int win; 836 unsigned long start, end, addr; 837 pgoff_t ilx; 838 bool page_allocated; 839 840 win = swap_vma_ra_win(vmf, &start, &end); 841 if (win == 1) 842 goto skip; 843 844 ilx = targ_ilx - PFN_DOWN(vmf->address - start); 845 846 blk_start_plug(&plug); 847 for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { 848 struct swap_info_struct *si = NULL; 849 softleaf_t entry; 850 851 if (!pte++) { 852 pte = pte_offset_map(vmf->pmd, addr); 853 if (!pte) 854 break; 855 } 856 pentry = ptep_get_lockless(pte); 857 entry = softleaf_from_pte(pentry); 858 859 if (!softleaf_is_swap(entry)) 860 continue; 861 pte_unmap(pte); 862 pte = NULL; 863 /* 864 * Readahead entry may come from a device that we are not 865 * holding a reference to, try to grab a reference, or skip. 866 */ 867 if (swp_type(entry) != swp_type(targ_entry)) { 868 si = get_swap_device(entry); 869 if (!si) 870 continue; 871 } 872 folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, 873 &page_allocated); 874 if (si) 875 put_swap_device(si); 876 if (!folio) 877 continue; 878 if (page_allocated) { 879 swap_read_folio(folio, &splug); 880 if (addr != vmf->address) { 881 folio_set_readahead(folio); 882 count_vm_event(SWAP_RA); 883 } 884 } 885 folio_put(folio); 886 } 887 if (pte) 888 pte_unmap(pte); 889 blk_finish_plug(&plug); 890 swap_read_unplug(splug); 891 lru_add_drain(); 892 skip: 893 /* The folio was likely read above, so no need for plugging here */ 894 folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, 895 &page_allocated); 896 if (unlikely(page_allocated)) 897 swap_read_folio(folio, NULL); 898 return folio; 899 } 900 901 /** 902 * swapin_readahead - swap in pages in hope we need them soon 903 * @entry: swap entry of this memory 904 * @gfp_mask: memory allocation flags 905 * @vmf: fault information 906 * 907 * Returns the struct folio for entry and addr, after queueing swapin. 908 * 909 * It's a main entry function for swap readahead. By the configuration, 910 * it will read ahead blocks by cluster-based(ie, physical disk based) 911 * or vma-based(ie, virtual address based on faulty address) readahead. 912 */ 913 struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 914 struct vm_fault *vmf) 915 { 916 struct mempolicy *mpol; 917 pgoff_t ilx; 918 struct folio *folio; 919 920 mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); 921 folio = swap_use_vma_readahead() ? 922 swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : 923 swap_cluster_readahead(entry, gfp_mask, mpol, ilx); 924 mpol_cond_put(mpol); 925 926 return folio; 927 } 928 929 #ifdef CONFIG_SYSFS 930 static ssize_t vma_ra_enabled_show(struct kobject *kobj, 931 struct kobj_attribute *attr, char *buf) 932 { 933 return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); 934 } 935 static ssize_t vma_ra_enabled_store(struct kobject *kobj, 936 struct kobj_attribute *attr, 937 const char *buf, size_t count) 938 { 939 ssize_t ret; 940 941 ret = kstrtobool(buf, &enable_vma_readahead); 942 if (ret) 943 return ret; 944 945 return count; 946 } 947 static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); 948 949 static struct attribute *swap_attrs[] = { 950 &vma_ra_enabled_attr.attr, 951 NULL, 952 }; 953 954 static const struct attribute_group swap_attr_group = { 955 .attrs = swap_attrs, 956 }; 957 958 static int __init swap_init(void) 959 { 960 int err; 961 struct kobject *swap_kobj; 962 963 swap_kobj = kobject_create_and_add("swap", mm_kobj); 964 if (!swap_kobj) { 965 pr_err("failed to create swap kobject\n"); 966 return -ENOMEM; 967 } 968 err = sysfs_create_group(swap_kobj, &swap_attr_group); 969 if (err) { 970 pr_err("failed to register swap group\n"); 971 goto delete_obj; 972 } 973 /* Swap cache writeback is LRU based, no tags for it */ 974 mapping_set_no_writeback_tags(&swap_space); 975 return 0; 976 977 delete_obj: 978 kobject_put(swap_kobj); 979 return err; 980 } 981 subsys_initcall(swap_init); 982 #endif 983