// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>

#include <asm/tlbflush.h>
#include <linux/leafops.h>
#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
#include "swap.h"

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries);
static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
static void swap_put_entry_locked(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long offset);
static bool folio_swapcache_freeable(struct folio *folio);
static void move_cluster(struct swap_info_struct *si,
			 struct swap_cluster_info *ci, struct list_head *list,
			 enum swap_cluster_flags new_flags);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
#define DEF_SWAP_PRIO -1
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
#endif	/* CONFIG_MIGRATION */

static const char Bad_file[] = "Bad swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
static PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by folio_alloc_swap() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but folio_alloc_swap() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static struct kmem_cache *swap_table_cachep;

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

struct percpu_swap_cluster {
	struct swap_info_struct *si[SWAP_NR_ORDERS];
	unsigned long offset[SWAP_NR_ORDERS];
	local_lock_t lock;
};

static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
	.si = { NULL },
	.offset = { SWAP_ENTRY_INVALID },
	.lock = INIT_LOCAL_LOCK(),
};

/* May return NULL on invalid type, caller must check for NULL return */
static struct swap_info_struct *swap_type_to_info(int type)
{
	if (type >= MAX_SWAPFILES)
		return NULL;
	return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}

/* May return NULL on invalid entry, caller must check for NULL return */
static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
{
	return swap_type_to_info(swp_type(entry));
}

/*
 * Use the second highest bit of the inuse_pages counter as the indicator
 * of whether one swap device is on the available plist, so the atomic can
 * still be updated arithmetically while having special data embedded.
 *
 * The inuse_pages counter is the only thing indicating if a device should
 * be on avail_lists or not (except swapon / swapoff). By embedding the
 * off-list bit in the atomic counter, updates no longer need any lock
 * to check the list status.
 *
 * This bit will be set if the device is not on the plist and not
 * usable, and will be cleared if the device is on the plist.
 */
#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
static long swap_usage_in_pages(struct swap_info_struct *si)
{
	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

static bool swap_only_has_cache(struct swap_info_struct *si,
				struct swap_cluster_info *ci,
				unsigned long offset, int nr_pages)
{
	unsigned int ci_off = offset % SWAPFILE_CLUSTER;
	unsigned char *map = si->swap_map + offset;
	unsigned char *map_end = map + nr_pages;
	unsigned long swp_tb;

	do {
		swp_tb = __swap_table_get(ci, ci_off);
		VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
		if (*map)
			return false;
		++ci_off;
	} while (++map < map_end);

	return true;
}

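/*
 * For illustration, the TTRS_* flags above are combined by callers in this
 * file roughly as follows (see swap_put_entries_cluster() and
 * swap_reclaim_full_clusters() below):
 *
 *	__try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
 *	__try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL);
 *
 * TTRS_ANYWAY attempts reclaim of a clean swap cache folio unconditionally,
 * while the second form only attempts it when the folio is no longer mapped
 * or swap space is getting tight.
 */
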
/*
 * Returns the number of pages in the folio that backs the swap entry. If
 * positive, the folio was reclaimed. If negative, the folio was not
 * reclaimed. If 0, no folio was associated with the swap entry.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	const swp_entry_t entry = swp_entry(si->type, offset);
	struct swap_cluster_info *ci;
	struct folio *folio;
	int ret, nr_pages;
	bool need_reclaim;

again:
	folio = swap_cache_get_folio(entry);
	if (!folio)
		return 0;

	nr_pages = folio_nr_pages(folio);
	ret = -nr_pages;

	/*
	 * We hold a folio lock here. We have to use trylock to
	 * avoid deadlock. This is a special case and you should
	 * use folio_free_swap() with explicit folio_lock() in usual
	 * operations.
	 */
	if (!folio_trylock(folio))
		goto out;

	/*
	 * Offset could point to the middle of a large folio, or folio
	 * may no longer point to the expected offset before it's locked.
	 */
	if (!folio_matches_swap_entry(folio, entry)) {
		folio_unlock(folio);
		folio_put(folio);
		goto again;
	}
	offset = swp_offset(folio->swap);

	need_reclaim = ((flags & TTRS_ANYWAY) ||
			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
	if (!need_reclaim || !folio_swapcache_freeable(folio))
		goto out_unlock;

	/*
	 * It's safe to delete the folio from swap cache only if the folio
	 * is in swap cache with swap count == 0. The slots have no page table
	 * reference or pending writeback, and can't be allocated to others.
	 */
	ci = swap_cluster_lock(si, offset);
	need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
	swap_cluster_unlock(ci);
	if (!need_reclaim)
		goto out_unlock;

	swap_cache_del_folio(folio);
	folio_set_dirty(folio);
	ret = nr_pages;
out_unlock:
	folio_unlock(folio);
out:
	folio_put(folio);
	return ret;
}

static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
					   nr_blocks, GFP_KERNEL);
		if (err)
			return err;
		cond_resched();
	}

	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
					   nr_blocks, GFP_KERNEL);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}

sector_t swap_folio_sector(struct folio *folio)
{
	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
	struct swap_extent *se;
	sector_t sector;
	pgoff_t offset;

	offset = swp_offset(folio->swap);
	se = offset_to_swap_extent(sis, offset);
	sector = se->start_block + (offset - se->start_page);
	return sector << (PAGE_SHIFT - 9);
}

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					 nr_blocks, GFP_NOIO))
			break;

		se = next_se(se);
	}
}

#define LATENCY_LIMIT		256

static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
	return info->count == 0;
}

static inline bool cluster_is_discard(struct swap_cluster_info *info)
{
	return info->flags == CLUSTER_FLAG_DISCARD;
}

static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
{
	return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
}

static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
	if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
		return false;
	if (!cluster_table_is_alloced(ci))
		return false;
	if (!order)
		return true;
	return cluster_is_empty(ci) || order == ci->order;
}

static inline unsigned int cluster_index(struct swap_info_struct *si,
					 struct swap_cluster_info *ci)
{
	return ci - si->cluster_info;
}

static inline unsigned int cluster_offset(struct swap_info_struct *si,
					  struct swap_cluster_info *ci)
{
	return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}

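/*
 * For reference: a cluster covers SWAPFILE_CLUSTER slots, so the helpers
 * above convert between a cluster_info pointer, its index, and the first
 * swap offset it covers. As a rough example (assuming SWAPFILE_CLUSTER is
 * 512, i.e. THP_SWAP on x86_64 with 4K pages), cluster index 3 covers swap
 * offsets [1536, 2048).
 */
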
static struct swap_table *swap_table_alloc(gfp_t gfp)
{
	struct folio *folio;

	if (!SWP_TABLE_USE_PAGE)
		return kmem_cache_zalloc(swap_table_cachep, gfp);

	folio = folio_alloc(gfp | __GFP_ZERO, 0);
	if (folio)
		return folio_address(folio);
	return NULL;
}

static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
{
	struct folio *folio;

	folio = page_folio(container_of(head, struct page, rcu_head));
	folio_put(folio);
}

static void swap_table_free(struct swap_table *table)
{
	if (!SWP_TABLE_USE_PAGE) {
		kmem_cache_free(swap_table_cachep, table);
		return;
	}

	call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
		 swap_table_free_folio_rcu_cb);
}

static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
	unsigned int ci_off;
	struct swap_table *table;

	/* Only an empty cluster's table is allowed to be freed */
	lockdep_assert_held(&ci->lock);
	VM_WARN_ON_ONCE(!cluster_is_empty(ci));
	for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
		VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
	table = (void *)rcu_dereference_protected(ci->table, true);
	rcu_assign_pointer(ci->table, NULL);

	swap_table_free(table);
}

/*
 * Allocate the swap table for one cluster. Attempt an atomic allocation
 * first, then fall back to a sleeping allocation.
 */
static struct swap_cluster_info *
swap_cluster_alloc_table(struct swap_info_struct *si,
			 struct swap_cluster_info *ci)
{
	struct swap_table *table;

	/*
	 * Only cluster isolation from the allocator does table allocation.
	 * The swap allocator uses percpu clusters and holds the local lock.
	 */
	lockdep_assert_held(&ci->lock);
	lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);

	/* The cluster must be free and was just isolated from the free list. */
	VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));

	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
	if (table) {
		rcu_assign_pointer(ci->table, table);
		return ci;
	}

	/*
	 * Try a sleeping allocation. Each isolated free cluster may cause
	 * a sleeping allocation, but there is a limited number of them, so
	 * the potential recursive allocation is limited.
	 */
	spin_unlock(&ci->lock);
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_unlock(&si->global_cluster_lock);
	local_unlock(&percpu_swap_cluster.lock);

	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);

	/*
	 * Back to atomic context. We might have migrated to a new CPU with a
	 * usable percpu cluster, but just keep using the isolated cluster to
	 * make things easier. Migration indicates a slight change of workload,
	 * so using a new free cluster might not be a bad idea. The worst that
	 * can happen from ignoring the percpu cluster is fragmentation, which
	 * is acceptable since this fallback and race are rare.
	 */
	local_lock(&percpu_swap_cluster.lock);
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_lock(&si->global_cluster_lock);
	spin_lock(&ci->lock);

	/* Nothing except this helper should touch a dangling empty cluster. */
	if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
		if (table)
			swap_table_free(table);
		return ci;
	}

	if (!table) {
		move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
		spin_unlock(&ci->lock);
		return NULL;
	}

	rcu_assign_pointer(ci->table, table);
	return ci;
}

static void move_cluster(struct swap_info_struct *si,
			 struct swap_cluster_info *ci, struct list_head *list,
			 enum swap_cluster_flags new_flags)
{
	VM_WARN_ON(ci->flags == new_flags);

	BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
	lockdep_assert_held(&ci->lock);

	spin_lock(&si->lock);
	if (ci->flags == CLUSTER_FLAG_NONE)
		list_add_tail(&ci->list, list);
	else
		list_move_tail(&ci->list, list);
	spin_unlock(&si->lock);
	ci->flags = new_flags;
}

/* Add a cluster to the discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
					  struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	swap_cluster_free_table(ci);
	move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
	ci->order = 0;
}

/*
 * Isolate and lock the first cluster that is not contended on a list, and
 * clear its flag before taking it off-list. The cluster flag must be kept
 * in sync with the list status, so cluster updaters can always know the
 * cluster list status without touching the si lock.
 *
 * Note it's possible that all clusters on a list are contended, so
 * this may return NULL for a non-empty list.
 */
static struct swap_cluster_info *isolate_lock_cluster(
		struct swap_info_struct *si, struct list_head *list)
{
	struct swap_cluster_info *ci, *found = NULL;

	spin_lock(&si->lock);
	list_for_each_entry(ci, list, list) {
		if (!spin_trylock(&ci->lock))
			continue;

		/* We may only isolate and clear flags of the following lists */
		VM_BUG_ON(!ci->flags);
		VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
			  ci->flags != CLUSTER_FLAG_FULL);

		list_del(&ci->list);
		ci->flags = CLUSTER_FLAG_NONE;
		found = ci;
		break;
	}
	spin_unlock(&si->lock);

	if (found && !cluster_table_is_alloced(found)) {
		/* Only an empty free cluster's swap table can be freed. */
		VM_WARN_ON_ONCE(list != &si->free_clusters);
		VM_WARN_ON_ONCE(!cluster_is_empty(found));
		return swap_cluster_alloc_table(si, found);
	}

	return found;
}

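/*
 * Note on lock ordering: move_cluster() takes si->lock while already
 * holding ci->lock, so the nesting order is ci->lock -> si->lock. That is
 * why isolate_lock_cluster() above, which walks the list under si->lock,
 * may only use spin_trylock() on ci->lock: a blocking lock there could
 * deadlock against a concurrent move_cluster().
 */
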
/*
 * Do the deferred discards. After a cluster discard is finished, the
 * cluster will be added back to the free cluster list. Discarding clusters
 * is a bit special as they don't participate in allocation or reclaim, so
 * clusters marked as CLUSTER_FLAG_DISCARD must remain off-list or on the
 * discard list.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *ci;
	bool ret = false;
	unsigned int idx;

	spin_lock(&si->lock);
	while (!list_empty(&si->discard_clusters)) {
		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
		/*
		 * Delete the cluster from the list to prepare for discard, but
		 * keep the CLUSTER_FLAG_DISCARD flag: percpu_swap_cluster could
		 * be pointing to it, or it could be run into by relocate_cluster.
		 */
		list_del(&ci->list);
		idx = cluster_index(si, ci);
		spin_unlock(&si->lock);
		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				     SWAPFILE_CLUSTER);

		spin_lock(&ci->lock);
		/*
		 * Discard is done, clear its flags as it's off-list, then
		 * return the cluster to the allocation list.
		 */
		ci->flags = CLUSTER_FLAG_NONE;
		__free_cluster(si, ci);
		spin_unlock(&ci->lock);
		ret = true;
		spin_lock(&si->lock);
	}
	spin_unlock(&si->lock);
	return ret;
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	swap_do_scheduled_discard(si);
}

static void swap_users_ref_free(struct percpu_ref *ref)
{
	struct swap_info_struct *si;

	si = container_of(ref, struct swap_info_struct, users);
	complete(&si->comp);
}

/*
 * Must be called after freeing if ci->count == 0, moves the cluster to the
 * free or discard list.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	VM_BUG_ON(ci->count != 0);
	VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
	lockdep_assert_held(&ci->lock);

	/*
	 * If the swap is discardable, schedule a discard of the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, ci);
		return;
	}

	__free_cluster(si, ci);
}

/*
 * Must be called after freeing if ci->count != 0, moves the cluster to the
 * nonfull list.
 */
static void partial_free_cluster(struct swap_info_struct *si,
				 struct swap_cluster_info *ci)
{
	VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
	lockdep_assert_held(&ci->lock);

	if (ci->flags != CLUSTER_FLAG_NONFULL)
		move_cluster(si, ci, &si->nonfull_clusters[ci->order],
			     CLUSTER_FLAG_NONFULL);
}

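/*
 * Rough overview of the cluster lifecycle (a sketch; see the helpers above
 * and relocate_cluster() below for the exact rules):
 *
 *	free -> (allocations) -> frag or full
 *	full or frag -> nonfull (after a partial free) -> free or discard
 *	discard -> free (once the deferred discard completes)
 *
 * A cluster's flags always reflect which list it is on, or
 * CLUSTER_FLAG_NONE while it is temporarily isolated off-list.
 */
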
/*
 * Must be called after allocation, moves the cluster to the full or frag
 * list. Note: allocation doesn't acquire the si lock, and may drop the ci
 * lock for reclaim, so the cluster could be anywhere when called.
 */
static void relocate_cluster(struct swap_info_struct *si,
			     struct swap_cluster_info *ci)
{
	lockdep_assert_held(&ci->lock);

	/* Discard cluster must remain off-list or on discard list */
	if (cluster_is_discard(ci))
		return;

	if (!ci->count) {
		if (ci->flags != CLUSTER_FLAG_FREE)
			free_cluster(si, ci);
	} else if (ci->count != SWAPFILE_CLUSTER) {
		if (ci->flags != CLUSTER_FLAG_FRAG)
			move_cluster(si, ci, &si->frag_clusters[ci->order],
				     CLUSTER_FLAG_FRAG);
	} else {
		if (ci->flags != CLUSTER_FLAG_FULL)
			move_cluster(si, ci, &si->full_clusters,
				     CLUSTER_FLAG_FULL);
	}
}

/*
 * The cluster corresponding to @offset will be accounted as having one bad
 * slot. The cluster will not be added to the free cluster list, and its
 * usage counter will be increased by 1. Only used for initialization.
 */
static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
				       unsigned long offset)
{
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_table *table;
	struct swap_cluster_info *ci;

	ci = cluster_info + idx;
	if (!ci->table) {
		table = swap_table_alloc(GFP_KERNEL);
		if (!table)
			return -ENOMEM;
		rcu_assign_pointer(ci->table, table);
	}

	ci->count++;

	WARN_ON(ci->count > SWAPFILE_CLUSTER);
	WARN_ON(ci->flags);

	return 0;
}

/*
 * Reclaim drops the ci lock, so the cluster may become unusable (freed or
 * stolen by a lower order). @usable will be set to false if that happens.
 */
static bool cluster_reclaim_range(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long start, unsigned int order,
				  bool *usable)
{
	unsigned int nr_pages = 1 << order;
	unsigned long offset = start, end = start + nr_pages;
	unsigned char *map = si->swap_map;
	unsigned long swp_tb;

	spin_unlock(&ci->lock);
	do {
		if (READ_ONCE(map[offset]))
			break;
		swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
		if (swp_tb_is_folio(swp_tb)) {
			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
				break;
		}
	} while (++offset < end);
	spin_lock(&ci->lock);

	/*
	 * We just dropped ci->lock, so the cluster could have been used by
	 * another order or freed; check if it's still usable or empty.
	 */
	if (!cluster_is_usable(ci, order)) {
		*usable = false;
		return false;
	}
	*usable = true;

	/* Fast path, no need to scan if the whole cluster is empty */
	if (cluster_is_empty(ci))
		return true;

	/*
	 * Recheck the range no matter whether reclaim succeeded or not, the
	 * slot could have been freed while we were not holding the lock.
	 */
	for (offset = start; offset < end; offset++) {
		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
		if (map[offset] || !swp_tb_is_null(swp_tb))
			return false;
	}

	return true;
}

static bool cluster_scan_range(struct swap_info_struct *si,
			       struct swap_cluster_info *ci,
			       unsigned long offset, unsigned int nr_pages,
			       bool *need_reclaim)
{
	unsigned long end = offset + nr_pages;
	unsigned char *map = si->swap_map;
	unsigned long swp_tb;

	if (cluster_is_empty(ci))
		return true;

	do {
		if (map[offset])
			return false;
		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
		if (swp_tb_is_folio(swp_tb)) {
			if (!vm_swap_full())
				return false;
			*need_reclaim = true;
		} else {
			/* An entry with no count and no cache must be null */
			VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
		}
	} while (++offset < end);

	return true;
}

/*
 * Currently, the swap table is not used for count tracking, so just
 * do a sanity check here to ensure nothing leaked: the swap
 * table should be empty upon freeing.
 */
static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
					    unsigned int start, unsigned int nr)
{
	unsigned int ci_off = start % SWAPFILE_CLUSTER;
	unsigned int ci_end = ci_off + nr;
	unsigned long swp_tb;

	if (IS_ENABLED(CONFIG_DEBUG_VM)) {
		do {
			swp_tb = __swap_table_get(ci, ci_off);
			VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
		} while (++ci_off < ci_end);
	}
}

static bool cluster_alloc_range(struct swap_info_struct *si,
				struct swap_cluster_info *ci,
				struct folio *folio,
				unsigned int offset)
{
	unsigned long nr_pages;
	unsigned int order;

	lockdep_assert_held(&ci->lock);

	if (!(si->flags & SWP_WRITEOK))
		return false;

	/*
	 * All mm swap allocation starts with a folio (folio_alloc_swap);
	 * it's also the only allocation path for large order allocations.
	 * Such swap slots start with count == 0, and the count is increased
	 * upon folio unmap.
	 *
	 * Else, it's an exclusive order 0 allocation for hibernation.
	 * The slot starts with count == 1 and never increases.
	 */
	if (likely(folio)) {
		order = folio_order(folio);
		nr_pages = 1 << order;
		__swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
	} else if (IS_ENABLED(CONFIG_HIBERNATION)) {
		order = 0;
		nr_pages = 1;
		WARN_ON_ONCE(si->swap_map[offset]);
		si->swap_map[offset] = 1;
		swap_cluster_assert_table_empty(ci, offset, 1);
	} else {
		/* Allocation without a folio is only possible with hibernation */
		WARN_ON_ONCE(1);
		return false;
	}

	/*
	 * The first allocation in a cluster makes the
	 * cluster exclusive to this order
	 */
	if (cluster_is_empty(ci))
		ci->order = order;
	ci->count += nr_pages;
	swap_range_alloc(si, nr_pages);

	return true;
}

/* Try to use a new cluster for the current CPU and allocate from it. */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
					    struct swap_cluster_info *ci,
					    struct folio *folio, unsigned long offset)
{
	unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
	unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
	unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
	unsigned int order = likely(folio) ? folio_order(folio) : 0;
	unsigned int nr_pages = 1 << order;
	bool need_reclaim, ret, usable;

	lockdep_assert_held(&ci->lock);
	VM_WARN_ON(!cluster_is_usable(ci, order));

	if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
		goto out;

	for (end -= nr_pages; offset <= end; offset += nr_pages) {
		need_reclaim = false;
		if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
			continue;
		if (need_reclaim) {
			ret = cluster_reclaim_range(si, ci, offset, order, &usable);
			if (!usable)
				goto out;
			if (cluster_is_empty(ci))
				offset = start;
			/* Reclaim failed but cluster is usable, try next */
			if (!ret)
				continue;
		}
		if (!cluster_alloc_range(si, ci, folio, offset))
			break;
		found = offset;
		offset += nr_pages;
		if (ci->count < SWAPFILE_CLUSTER && offset <= end)
			next = offset;
		break;
	}
out:
	relocate_cluster(si, ci);
	swap_cluster_unlock(ci);
	if (si->flags & SWP_SOLIDSTATE) {
		this_cpu_write(percpu_swap_cluster.offset[order], next);
		this_cpu_write(percpu_swap_cluster.si[order], si);
	} else {
		si->global_cluster->next[order] = next;
	}
	return found;
}

static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
					 struct list_head *list,
					 struct folio *folio,
					 bool scan_all)
{
	unsigned int found = SWAP_ENTRY_INVALID;

	do {
		struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
		unsigned long offset;

		if (!ci)
			break;
		offset = cluster_offset(si, ci);
		found = alloc_swap_scan_cluster(si, ci, folio, offset);
		if (found)
			break;
	} while (scan_all);

	return found;
}

static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
	long to_scan = 1;
	unsigned long offset, end;
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	int nr_reclaim;

	if (force)
		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;

	while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
		offset = cluster_offset(si, ci);
		end = min(si->max, offset + SWAPFILE_CLUSTER);
		to_scan--;

		while (offset < end) {
			if (!READ_ONCE(map[offset]) &&
			    swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
				spin_unlock(&ci->lock);
				nr_reclaim = __try_to_reclaim_swap(si, offset,
								   TTRS_ANYWAY);
				spin_lock(&ci->lock);
				if (nr_reclaim) {
					offset += abs(nr_reclaim);
					continue;
				}
			}
			offset++;
		}

		/* in case no swap cache is reclaimed */
		if (ci->flags == CLUSTER_FLAG_NONE)
			relocate_cluster(si, ci);

		swap_cluster_unlock(ci);
		if (to_scan <= 0)
			break;
	}
}

static void swap_reclaim_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, reclaim_work);

	swap_reclaim_full_clusters(si, true);
}

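/*
 * Roughly, cluster_alloc_swap_entry() below falls back through the cluster
 * lists in this order (a sketch; the exact flow depends on the device flags
 * and the allocation order):
 *
 *	per-device current cluster (rotational devices only)
 *	-> free clusters (tried first when discard is enabled)
 *	-> nonfull clusters of the same order
 *	-> free clusters
 *	-> reclaim of full clusters when swap is mostly used (vm_swap_full())
 *	-> fragment clusters of the same order
 *	-> for order 0 only: stealing from higher order frag/nonfull clusters
 */
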
/*
 * Try to allocate swap entries with the specified order, and try to set a
 * new cluster for the current CPU too.
 */
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
					      struct folio *folio)
{
	struct swap_cluster_info *ci;
	unsigned int order = likely(folio) ? folio_order(folio) : 0;
	unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;

	/*
	 * A swapfile that is not a block device is unable to
	 * allocate large entries.
	 */
	if (order && !(si->flags & SWP_BLKDEV))
		return 0;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		/* Serialize HDD SWAP allocation for each device. */
		spin_lock(&si->global_cluster_lock);
		offset = si->global_cluster->next[order];
		if (offset == SWAP_ENTRY_INVALID)
			goto new_cluster;

		ci = swap_cluster_lock(si, offset);
		/* Cluster could have been used by another order */
		if (cluster_is_usable(ci, order)) {
			if (cluster_is_empty(ci))
				offset = cluster_offset(si, ci);
			found = alloc_swap_scan_cluster(si, ci, folio, offset);
		} else {
			swap_cluster_unlock(ci);
		}
		if (found)
			goto done;
	}

new_cluster:
	/*
	 * If the device needs discard, prefer new clusters over nonfull
	 * ones to spread out the writes.
	 */
	if (si->flags & SWP_PAGE_DISCARD) {
		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
		if (found)
			goto done;
	}

	if (order < PMD_ORDER) {
		found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
		if (found)
			goto done;
	}

	if (!(si->flags & SWP_PAGE_DISCARD)) {
		found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
		if (found)
			goto done;
	}

	/* Try to reclaim full clusters if the free and nonfull lists are drained */
	if (vm_swap_full())
		swap_reclaim_full_clusters(si, false);

	if (order < PMD_ORDER) {
		/*
		 * Scanning only one fragment cluster is good enough. Order 0
		 * allocation will surely succeed, and large allocation
		 * failure is not critical. Scanning one cluster still
		 * keeps the list rotated and reclaimed (for clean swap cache).
		 */
		found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
		if (found)
			goto done;
	}

	if (order)
		goto done;

	/* Order 0 stealing from higher order */
	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
		/*
		 * Clusters here have at least one usable slot and can't fail order 0
		 * allocation, but reclaim may drop si->lock and race with another user.
		 */
		found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
		if (found)
			goto done;

		found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
		if (found)
			goto done;
	}
done:
	if (!(si->flags & SWP_SOLIDSTATE))
		spin_unlock(&si->global_cluster_lock);

	return found;
}

/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	if (swapoff) {
		/*
		 * Forcefully remove it. Clear the SWP_WRITEOK flag for
		 * swapoff here so it's synchronized by both si->lock and
		 * swap_avail_lock, to ensure the result can be seen by
		 * add_to_avail_list.
		 */
		lockdep_assert_held(&si->lock);
		si->flags &= ~SWP_WRITEOK;
		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
	} else {
		/*
		 * If not called by swapoff, take it off-list only if it's
		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
		 * si->inuse_pages == pages). Any concurrent slot freeing,
		 * or the device already being removed from the plist by
		 * someone else, will make this return false.
		 */
		pages = si->pages;
		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					     pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	plist_del(&si->avail_list, &swap_avail_head);

skip:
	spin_unlock(&swap_avail_lock);
}

/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
	long val;
	unsigned long pages;

	spin_lock(&swap_avail_lock);

	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
	if (swapon) {
		lockdep_assert_held(&si->lock);
		si->flags |= SWP_WRITEOK;
	} else {
		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
			goto skip;
	}

	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
		goto skip;

	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);

	/*
	 * When the device is full and on the plist, only one updater will
	 * see (inuse_pages == si->pages) and call del_from_avail_list. If
	 * that updater happens to be here, just skip adding.
	 */
	pages = si->pages;
	if (val == pages) {
		/* Just like the cmpxchg in del_from_avail_list */
		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
					    pages | SWAP_USAGE_OFFLIST_BIT))
			goto skip;
	}

	plist_add(&si->avail_list, &swap_avail_head);

skip:
	spin_unlock(&swap_avail_lock);
}

/*
 * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
 * within each cluster, so the total contribution to the global counter should
 * always be positive and cannot exceed the total number of usable slots.
 */
static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * If the device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
	 * remove it from the plist.
	 */
	if (unlikely(val == si->pages)) {
		del_from_avail_list(si, false);
		return true;
	}

	return false;
}

static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);

	/*
	 * If the device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
	 * add it to the plist.
	 */
	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
		add_to_avail_list(si, false);
}

static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries)
{
	if (swap_usage_add(si, nr_entries)) {
		if (vm_swap_full())
			schedule_work(&si->reclaim_work);
	}
	atomic_long_sub(nr_entries, &nr_swap_pages);
}

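/*
 * For illustration, the inuse_pages encoding used above works roughly like
 * this (a sketch): with si->pages == P, a plain value below P means the
 * device is usable and on the plist; exactly P means it is full but one
 * updater is about to take it off the plist; P with SWAP_USAGE_OFFLIST_BIT
 * set means it is full and already off the plist.
 */
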
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
	unsigned int i;

	/*
	 * Use atomic clear_bit operations only on zeromap instead of non-atomic
	 * bitmap_clear to prevent adjacent bits corruption due to simultaneous
	 * writes.
	 */
	for (i = 0; i < nr_entries; i++) {
		clear_bit(offset + i, si->zeromap);
		zswap_invalidate(swp_entry(si->type, offset + i));
	}

	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	__swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);

	/*
	 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
	 * only after the above cleanups are done.
	 */
	smp_wmb();
	atomic_long_add(nr_entries, &nr_swap_pages);
	swap_usage_sub(si, nr_entries);
}

static bool get_swap_device_info(struct swap_info_struct *si)
{
	if (!percpu_ref_tryget_live(&si->users))
		return false;
	/*
	 * Guarantee the si->users are checked before accessing other
	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
	 * up to date.
	 *
	 * Paired with the spin_unlock() after setup_swap_info() in
	 * enable_swap_info(), and smp_wmb() in swapoff.
	 */
	smp_rmb();
	return true;
}

/*
 * Fast path: try to get swap entries with the specified order from the
 * current CPU's swap entry pool (a cluster).
 */
static bool swap_alloc_fast(struct folio *folio)
{
	unsigned int order = folio_order(folio);
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned int offset;

	/*
	 * Once allocated, swap_info_struct will never be completely freed,
	 * so checking its liveness by get_swap_device_info is enough.
	 */
	si = this_cpu_read(percpu_swap_cluster.si[order]);
	offset = this_cpu_read(percpu_swap_cluster.offset[order]);
	if (!si || !offset || !get_swap_device_info(si))
		return false;

	ci = swap_cluster_lock(si, offset);
	if (cluster_is_usable(ci, order)) {
		if (cluster_is_empty(ci))
			offset = cluster_offset(si, ci);
		alloc_swap_scan_cluster(si, ci, folio, offset);
	} else {
		swap_cluster_unlock(ci);
	}

	put_swap_device(si);
	return folio_test_swapcache(folio);
}

/* Rotate the device and switch to a new cluster */
static void swap_alloc_slow(struct folio *folio)
{
	struct swap_info_struct *si, *next;

	spin_lock(&swap_avail_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
		/* Rotate the device and switch to a new cluster */
		plist_requeue(&si->avail_list, &swap_avail_head);
		spin_unlock(&swap_avail_lock);
		if (get_swap_device_info(si)) {
			cluster_alloc_swap_entry(si, folio);
			put_swap_device(si);
			if (folio_test_swapcache(folio))
				return;
			if (folio_test_large(folio))
				return;
		}

		spin_lock(&swap_avail_lock);
		/*
		 * If we got here, it's likely that si was almost full before,
		 * and multiple callers probably all tried to get a page from
		 * the same si and it filled up before we could get one; or,
		 * the si filled up while we had dropped swap_avail_lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over if we
		 * have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_list))
			goto start_over;
	}
	spin_unlock(&swap_avail_lock);
}

/*
 * Discard pending clusters in a synchronized way when under high pressure.
 * Return: true if any cluster is discarded.
 */
static bool swap_sync_discard(void)
{
	bool ret = false;
	struct swap_info_struct *si, *next;

	spin_lock(&swap_lock);
start_over:
	plist_for_each_entry_safe(si, next, &swap_active_head, list) {
		spin_unlock(&swap_lock);
		if (get_swap_device_info(si)) {
			if (si->flags & SWP_PAGE_DISCARD)
				ret = swap_do_scheduled_discard(si);
			put_swap_device(si);
		}
		if (ret)
			return true;

		spin_lock(&swap_lock);
		if (plist_node_empty(&next->list))
			goto start_over;
	}
	spin_unlock(&swap_lock);

	return false;
}

/**
 * swap_put_entries_cluster - Decrease the swap count of a set of slots.
 * @si: The swap device.
 * @start: start offset of slots.
 * @nr: number of slots.
 * @reclaim_cache: if true, also reclaim the swap cache.
 *
 * This helper decreases the swap count of a set of slots and tries to
 * batch free them. Also reclaims the swap cache if @reclaim_cache is true.
 * Context: The caller must ensure that all slots belong to the same
 * cluster and that their swap count doesn't underflow.
 */
static void swap_put_entries_cluster(struct swap_info_struct *si,
				     unsigned long start, int nr,
				     bool reclaim_cache)
{
	unsigned long offset = start, end = start + nr;
	unsigned long batch_start = SWAP_ENTRY_INVALID;
	struct swap_cluster_info *ci;
	bool need_reclaim = false;
	unsigned int nr_reclaimed;
	unsigned long swp_tb;
	unsigned int count;

	ci = swap_cluster_lock(si, offset);
	do {
		swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
		count = si->swap_map[offset];
		VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
		if (count == 1) {
			/* count == 1 and non-cached slots will be batch freed. */
			if (!swp_tb_is_folio(swp_tb)) {
				if (!batch_start)
					batch_start = offset;
				continue;
			}
			/* count will be 0 after put, slot can be reclaimed */
			need_reclaim = true;
		}
		/*
		 * A count != 1 or cached slot can't be freed. Put its swap
		 * count and then free the interrupted pending batch. Cached
		 * slots will be freed when the folio is removed from the swap
		 * cache (__swap_cache_del_folio).
		 */
		swap_put_entry_locked(si, ci, offset);
		if (batch_start) {
			swap_entries_free(si, ci, batch_start, offset - batch_start);
			batch_start = SWAP_ENTRY_INVALID;
		}
	} while (++offset < end);

	if (batch_start)
		swap_entries_free(si, ci, batch_start, offset - batch_start);
	swap_cluster_unlock(ci);

	if (!need_reclaim || !reclaim_cache)
		return;

	offset = start;
	do {
		nr_reclaimed = __try_to_reclaim_swap(si, offset,
						     TTRS_UNMAPPED | TTRS_FULL);
		offset++;
		if (nr_reclaimed)
			offset = round_up(offset, abs(nr_reclaimed));
	} while (offset < end);
}

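/*
 * A rough sketch of how the helpers below tie together on the usual
 * anonymous swap-out path (for illustration only; see the callers
 * elsewhere in mm/ for the real flow):
 *
 *	folio_alloc_swap(folio);	// allocate slots, add to swap cache
 *	folio_dup_swap(folio, NULL);	// on unmap, each PTE takes a count
 *	...
 *	folio_put_swap(folio, NULL);	// on swap-in, the PTE drops its count
 *	folio_free_swap(folio);		// release the slots if no longer used
 */
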
/**
 * folio_alloc_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
int folio_alloc_swap(struct folio *folio)
{
	unsigned int order = folio_order(folio);
	unsigned int size = 1 << order;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	if (order) {
		/*
		 * Reject large allocation when THP_SWAP is disabled,
		 * the caller should split the folio and try again.
		 */
		if (!IS_ENABLED(CONFIG_THP_SWAP))
			return -EAGAIN;

		/*
		 * Allocation size should never exceed cluster size
		 * (HPAGE_PMD_SIZE).
		 */
		if (size > SWAPFILE_CLUSTER) {
			VM_WARN_ON_ONCE(1);
			return -EINVAL;
		}
	}

again:
	local_lock(&percpu_swap_cluster.lock);
	if (!swap_alloc_fast(folio))
		swap_alloc_slow(folio);
	local_unlock(&percpu_swap_cluster.lock);

	if (!order && unlikely(!folio_test_swapcache(folio))) {
		if (swap_sync_discard())
			goto again;
	}

	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
	if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
		swap_cache_del_folio(folio);

	if (unlikely(!folio_test_swapcache(folio)))
		return -ENOMEM;

	return 0;
}

/**
 * folio_dup_swap() - Increase swap count of swap entries of a folio.
 * @folio: folio with swap entries bound to it.
 * @subpage: if not NULL, only increase the swap count of this subpage.
 *
 * Typically called when the folio is unmapped and its swap entries
 * take its place.
 *
 * Context: Caller must ensure the folio is locked and in the swap cache.
 * NOTE: The caller also has to ensure there is no racing call to
 * swap_put_entries_direct on its swap entry before this helper returns, or
 * the swap map may underflow. Currently, we only accept @subpage == NULL
 * for shmem due to the limitation of swap continuation: shmem always
 * duplicates the swap entry only once, so there is no such issue for it.
 */
int folio_dup_swap(struct folio *folio, struct page *subpage)
{
	int err = 0;
	swp_entry_t entry = folio->swap;
	unsigned long nr_pages = folio_nr_pages(folio);

	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);

	if (subpage) {
		entry.val += folio_page_idx(folio, subpage);
		nr_pages = 1;
	}

	while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);

	return err;
}

/**
 * folio_put_swap() - Decrease swap count of swap entries of a folio.
 * @folio: folio with swap entries bound to it, must be in swap cache and locked.
 * @subpage: if not NULL, only decrease the swap count of this subpage.
 *
 * This won't free the swap slots even if the swap count drops to zero, they
 * are still pinned by the swap cache. Users may call folio_free_swap() to
 * free them.
 * Context: Caller must ensure the folio is locked and in the swap cache.
 */
void folio_put_swap(struct folio *folio, struct page *subpage)
{
	swp_entry_t entry = folio->swap;
	unsigned long nr_pages = folio_nr_pages(folio);
	struct swap_info_struct *si = __swap_entry_to_info(entry);

	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);

	if (subpage) {
		entry.val += folio_page_idx(folio, subpage);
		nr_pages = 1;
	}

	swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
}

static void swap_put_entry_locked(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  unsigned long offset)
{
	unsigned char count;

	count = si->swap_map[offset];
	if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(si, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	WRITE_ONCE(si->swap_map[offset], count);
	if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
		swap_entries_free(si, ci, offset, 1);
}

/*
 * When we get a swap entry, if there aren't some other ways to
 * prevent swapoff, such as the folio in swap cache is locked, RCU
 * reader side is locked, etc., the swap entry may become invalid
 * because of swapoff. Then, we need to enclose all swap related
 * functions with get_swap_device() and put_swap_device(), unless the
 * swap functions call get/put_swap_device() by themselves.
 *
 * RCU reader side lock (including any spinlock) is sufficient to
 * prevent swapoff, because synchronize_rcu() is called in swapoff()
 * before freeing data structures.
 *
 * Check whether swap entry is valid in the swap device. If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * via preventing the swap device from being swapoff, until
 * put_swap_device() is called. Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff. The caller must be prepared for that. For
 * example, the following situation is possible.
 *
 *   CPU1				CPU2
 *   do_swap_page()
 *     ...				swapoff+swapon
 *     swap_cache_alloc_folio()
 *       swap_cache_add_folio()
 *         // check swap_map
 *     // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map needs to be checked before
 * changing partly because the specified swap entry may be for another
 * swap device which has been swapoff. And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified not
 * changed with the page table locked to check whether the swap device
 * has been swapoff or swapoff+swapon.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	struct swap_info_struct *si;
	unsigned long offset;

	if (!entry.val)
		goto out;
	si = swap_entry_to_info(entry);
	if (!si)
		goto bad_nofile;
	if (!get_swap_device_info(si))
		goto out;
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto put_out;

	return si;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
put_out:
	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
	percpu_ref_put(&si->users);
	return NULL;
}

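/*
 * For illustration, the guard pattern described above looks roughly like
 * this (a sketch, error handling elided):
 *
 *	si = get_swap_device(entry);
 *	if (si) {
 *		... access si->swap_map / the swap cache for entry ...
 *		put_swap_device(si);
 *	}
 *
 * Holding the reference only keeps the device from going away; callers
 * must still revalidate the entry (e.g. recheck the PTE) as described
 * above.
 */
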
/*
 * Drop the last ref of swap entries, the caller has to ensure all entries
 * belong to the same cgroup and cluster.
 */
void swap_entries_free(struct swap_info_struct *si,
		       struct swap_cluster_info *ci,
		       unsigned long offset, unsigned int nr_pages)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	unsigned char *map = si->swap_map + offset;
	unsigned char *map_end = map + nr_pages;

	/* It should never free entries across different clusters */
	VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
	VM_BUG_ON(cluster_is_empty(ci));
	VM_BUG_ON(ci->count < nr_pages);

	ci->count -= nr_pages;
	do {
		VM_WARN_ON(*map > 1);
		*map = 0;
	} while (++map < map_end);

	mem_cgroup_uncharge_swap(entry, nr_pages);
	swap_range_free(si, offset, nr_pages);
	swap_cluster_assert_table_empty(ci, offset, nr_pages);

	if (!ci->count)
		free_cluster(si, ci);
	else
		partial_free_cluster(si, ci);
}

int __swap_count(swp_entry_t entry)
{
	struct swap_info_struct *si = __swap_entry_to_info(entry);
	pgoff_t offset = swp_offset(entry);

	return si->swap_map[offset];
}

/**
 * swap_entry_swapped - Check if the swap entry is swapped.
 * @si: the swap device.
 * @entry: the swap entry.
 */
bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci;
	int count;

	ci = swap_cluster_lock(si, offset);
	count = si->swap_map[offset];
	swap_cluster_unlock(ci);

	return count && count != SWAP_MAP_BAD;
}

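/*
 * Rough arithmetic of the continuation encoding used by swp_swapcount()
 * below (a sketch): with the base count m0 stored in swap_map and
 * continuation bytes m1, m2, ... stored in continuation pages, the total
 * count is
 *
 *	m0 + (SWAP_MAP_MAX + 1) * (m1 + (SWAP_CONT_MAX + 1) * (m2 + ...))
 *
 * i.e. each continuation level scales by another factor of
 * (SWAP_CONT_MAX + 1).
 */
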
/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns the exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	si = get_swap_device(entry);
	if (!si)
		return 0;

	offset = swp_offset(entry);

	ci = swap_cluster_lock(si, offset);

	count = si->swap_map[offset];
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_local_page(page);
		tmp_count = map[offset];
		kunmap_local(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	swap_cluster_unlock(ci);
	put_swap_device(si);
	return count;
}

static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry, int order)
{
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	unsigned int nr_pages = 1 << order;
	unsigned long roffset = swp_offset(entry);
	unsigned long offset = round_down(roffset, nr_pages);
	int i;
	bool ret = false;

	ci = swap_cluster_lock(si, offset);
	if (nr_pages == 1) {
		if (map[roffset])
			ret = true;
		goto unlock_out;
	}
	for (i = 0; i < nr_pages; i++) {
		if (map[offset + i]) {
			ret = true;
			break;
		}
	}
unlock_out:
	swap_cluster_unlock(ci);
	return ret;
}

static bool folio_swapped(struct folio *folio)
{
	swp_entry_t entry = folio->swap;
	struct swap_info_struct *si;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);

	si = __swap_entry_to_info(entry);
	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
		return swap_entry_swapped(si, entry);

	return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}

static bool folio_swapcache_freeable(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (!folio_test_swapcache(folio))
		return false;
	if (folio_test_writeback(folio))
		return false;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to folio_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a folio which has already been recorded in the
	 * image as a clean swapcache folio, and then reuse its swap for
	 * another page of the image. On waking from hibernation, the
	 * original folio might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return false;

	return true;
}

/**
 * folio_free_swap() - Free the swap space used for this folio.
 * @folio: The folio to remove.
1870 * 1871 * If swap is getting full, or if there are no more mappings of this folio, 1872 * then call folio_free_swap to free its swap space. 1873 * 1874 * Return: true if we were able to release the swap space. 1875 */ 1876 bool folio_free_swap(struct folio *folio) 1877 { 1878 if (!folio_swapcache_freeable(folio)) 1879 return false; 1880 if (folio_swapped(folio)) 1881 return false; 1882 1883 swap_cache_del_folio(folio); 1884 folio_set_dirty(folio); 1885 return true; 1886 } 1887 1888 /** 1889 * swap_put_entries_direct() - Release reference on range of swap entries and 1890 * reclaim their cache if no more references remain. 1891 * @entry: First entry of range. 1892 * @nr: Number of entries in range. 1893 * 1894 * For each swap entry in the contiguous range, release a reference. If any swap 1895 * entries become free, try to reclaim their underlying folios, if present. The 1896 * offset range is defined by [entry.offset, entry.offset + nr). 1897 * 1898 * Context: Caller must ensure there is no race condition on the reference 1899 * owner. e.g., locking the PTL of a PTE containing the entry being released. 1900 */ 1901 void swap_put_entries_direct(swp_entry_t entry, int nr) 1902 { 1903 const unsigned long start_offset = swp_offset(entry); 1904 const unsigned long end_offset = start_offset + nr; 1905 unsigned long offset, cluster_end; 1906 struct swap_info_struct *si; 1907 1908 si = get_swap_device(entry); 1909 if (WARN_ON_ONCE(!si)) 1910 return; 1911 if (WARN_ON_ONCE(end_offset > si->max)) 1912 goto out; 1913 1914 /* Put entries and reclaim cache in each cluster */ 1915 offset = start_offset; 1916 do { 1917 cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset); 1918 swap_put_entries_cluster(si, offset, cluster_end - offset, true); 1919 offset = cluster_end; 1920 } while (offset < end_offset); 1921 out: 1922 put_swap_device(si); 1923 } 1924 1925 #ifdef CONFIG_HIBERNATION 1926 /* Allocate a slot for hibernation */ 1927 swp_entry_t swap_alloc_hibernation_slot(int type) 1928 { 1929 struct swap_info_struct *si = swap_type_to_info(type); 1930 unsigned long offset; 1931 swp_entry_t entry = {0}; 1932 1933 if (!si) 1934 goto fail; 1935 1936 /* This is called for allocating swap entry, not cache */ 1937 if (get_swap_device_info(si)) { 1938 if (si->flags & SWP_WRITEOK) { 1939 /* 1940 * Grab the local lock to be compliant 1941 * with swap table allocation. 1942 */ 1943 local_lock(&percpu_swap_cluster.lock); 1944 offset = cluster_alloc_swap_entry(si, NULL); 1945 local_unlock(&percpu_swap_cluster.lock); 1946 if (offset) 1947 entry = swp_entry(si->type, offset); 1948 } 1949 put_swap_device(si); 1950 } 1951 fail: 1952 return entry; 1953 } 1954 1955 /* Free a slot allocated by swap_alloc_hibernation_slot */ 1956 void swap_free_hibernation_slot(swp_entry_t entry) 1957 { 1958 struct swap_info_struct *si; 1959 struct swap_cluster_info *ci; 1960 pgoff_t offset = swp_offset(entry); 1961 1962 si = get_swap_device(entry); 1963 if (WARN_ON(!si)) 1964 return; 1965 1966 ci = swap_cluster_lock(si, offset); 1967 swap_put_entry_locked(si, ci, offset); 1968 swap_cluster_unlock(ci); 1969 1970 /* In theory readahead might add it to the swap cache by accident */ 1971 __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); 1972 put_swap_device(si); 1973 } 1974 1975 /* 1976 * Find the swap type that corresponds to given device (if any). 1977 * 1978 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1979 * from 0, in which the swap header is expected to be located. 
1980 * 1981 * This is needed for the suspend to disk (aka swsusp). 1982 */ 1983 int swap_type_of(dev_t device, sector_t offset) 1984 { 1985 int type; 1986 1987 if (!device) 1988 return -1; 1989 1990 spin_lock(&swap_lock); 1991 for (type = 0; type < nr_swapfiles; type++) { 1992 struct swap_info_struct *sis = swap_info[type]; 1993 1994 if (!(sis->flags & SWP_WRITEOK)) 1995 continue; 1996 1997 if (device == sis->bdev->bd_dev) { 1998 struct swap_extent *se = first_se(sis); 1999 2000 if (se->start_block == offset) { 2001 spin_unlock(&swap_lock); 2002 return type; 2003 } 2004 } 2005 } 2006 spin_unlock(&swap_lock); 2007 return -ENODEV; 2008 } 2009 2010 int find_first_swap(dev_t *device) 2011 { 2012 int type; 2013 2014 spin_lock(&swap_lock); 2015 for (type = 0; type < nr_swapfiles; type++) { 2016 struct swap_info_struct *sis = swap_info[type]; 2017 2018 if (!(sis->flags & SWP_WRITEOK)) 2019 continue; 2020 *device = sis->bdev->bd_dev; 2021 spin_unlock(&swap_lock); 2022 return type; 2023 } 2024 spin_unlock(&swap_lock); 2025 return -ENODEV; 2026 } 2027 2028 /* 2029 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 2030 * corresponding to given index in swap_info (swap type). 2031 */ 2032 sector_t swapdev_block(int type, pgoff_t offset) 2033 { 2034 struct swap_info_struct *si = swap_type_to_info(type); 2035 struct swap_extent *se; 2036 2037 if (!si || !(si->flags & SWP_WRITEOK)) 2038 return 0; 2039 se = offset_to_swap_extent(si, offset); 2040 return se->start_block + (offset - se->start_page); 2041 } 2042 2043 /* 2044 * Return either the total number of swap pages of given type, or the number 2045 * of free pages of that type (depending on @free) 2046 * 2047 * This is needed for software suspend 2048 */ 2049 unsigned int count_swap_pages(int type, int free) 2050 { 2051 unsigned int n = 0; 2052 2053 spin_lock(&swap_lock); 2054 if ((unsigned int)type < nr_swapfiles) { 2055 struct swap_info_struct *sis = swap_info[type]; 2056 2057 spin_lock(&sis->lock); 2058 if (sis->flags & SWP_WRITEOK) { 2059 n = sis->pages; 2060 if (free) 2061 n -= swap_usage_in_pages(sis); 2062 } 2063 spin_unlock(&sis->lock); 2064 } 2065 spin_unlock(&swap_lock); 2066 return n; 2067 } 2068 #endif /* CONFIG_HIBERNATION */ 2069 2070 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 2071 { 2072 return pte_same(pte_swp_clear_flags(pte), swp_pte); 2073 } 2074 2075 /* 2076 * No need to decide whether this PTE shares the swap entry with others, 2077 * just let do_wp_page work it out if a write is requested later - to 2078 * force COW, vm_page_prot omits write permission from any private vma. 2079 */ 2080 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 2081 unsigned long addr, swp_entry_t entry, struct folio *folio) 2082 { 2083 struct page *page; 2084 struct folio *swapcache; 2085 spinlock_t *ptl; 2086 pte_t *pte, new_pte, old_pte; 2087 bool hwpoisoned = false; 2088 int ret = 1; 2089 2090 /* 2091 * If the folio is removed from swap cache by others, continue to 2092 * unuse other PTEs. try_to_unuse may try again if we missed this one. 
2093 */ 2094 if (!folio_matches_swap_entry(folio, entry)) 2095 return 0; 2096 2097 swapcache = folio; 2098 folio = ksm_might_need_to_copy(folio, vma, addr); 2099 if (unlikely(!folio)) 2100 return -ENOMEM; 2101 else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { 2102 hwpoisoned = true; 2103 folio = swapcache; 2104 } 2105 2106 page = folio_file_page(folio, swp_offset(entry)); 2107 if (PageHWPoison(page)) 2108 hwpoisoned = true; 2109 2110 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 2111 if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), 2112 swp_entry_to_pte(entry)))) { 2113 ret = 0; 2114 goto out; 2115 } 2116 2117 old_pte = ptep_get(pte); 2118 2119 if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { 2120 swp_entry_t swp_entry; 2121 2122 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2123 if (hwpoisoned) { 2124 swp_entry = make_hwpoison_entry(page); 2125 } else { 2126 swp_entry = make_poisoned_swp_entry(); 2127 } 2128 new_pte = swp_entry_to_pte(swp_entry); 2129 ret = 0; 2130 goto setpte; 2131 } 2132 2133 /* 2134 * Some architectures may have to restore extra metadata to the page 2135 * when reading from swap. This metadata may be indexed by swap entry 2136 * so this must be called before folio_put_swap(). 2137 */ 2138 arch_swap_restore(folio_swap(entry, folio), folio); 2139 2140 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2141 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 2142 folio_get(folio); 2143 if (folio == swapcache) { 2144 rmap_t rmap_flags = RMAP_NONE; 2145 2146 /* 2147 * See do_swap_page(): writeback would be problematic. 2148 * However, we do a folio_wait_writeback() just before this 2149 * call and have the folio locked. 2150 */ 2151 VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); 2152 if (pte_swp_exclusive(old_pte)) 2153 rmap_flags |= RMAP_EXCLUSIVE; 2154 /* 2155 * We currently only expect small !anon folios, which are either 2156 * fully exclusive or fully shared. If we ever get large folios 2157 * here, we have to be careful. 
2158 */ 2159 if (!folio_test_anon(folio)) { 2160 VM_WARN_ON_ONCE(folio_test_large(folio)); 2161 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 2162 folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); 2163 } else { 2164 folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); 2165 } 2166 } else { /* ksm created a completely new copy */ 2167 folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); 2168 folio_add_lru_vma(folio, vma); 2169 } 2170 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); 2171 if (pte_swp_soft_dirty(old_pte)) 2172 new_pte = pte_mksoft_dirty(new_pte); 2173 if (pte_swp_uffd_wp(old_pte)) 2174 new_pte = pte_mkuffd_wp(new_pte); 2175 setpte: 2176 set_pte_at(vma->vm_mm, addr, pte, new_pte); 2177 folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry))); 2178 out: 2179 if (pte) 2180 pte_unmap_unlock(pte, ptl); 2181 if (folio != swapcache) { 2182 folio_unlock(folio); 2183 folio_put(folio); 2184 } 2185 return ret; 2186 } 2187 2188 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 2189 unsigned long addr, unsigned long end, 2190 unsigned int type) 2191 { 2192 pte_t *pte = NULL; 2193 struct swap_info_struct *si; 2194 2195 si = swap_info[type]; 2196 do { 2197 struct folio *folio; 2198 unsigned long offset; 2199 unsigned char swp_count; 2200 softleaf_t entry; 2201 int ret; 2202 pte_t ptent; 2203 2204 if (!pte++) { 2205 pte = pte_offset_map(pmd, addr); 2206 if (!pte) 2207 break; 2208 } 2209 2210 ptent = ptep_get_lockless(pte); 2211 entry = softleaf_from_pte(ptent); 2212 2213 if (!softleaf_is_swap(entry)) 2214 continue; 2215 if (swp_type(entry) != type) 2216 continue; 2217 2218 offset = swp_offset(entry); 2219 pte_unmap(pte); 2220 pte = NULL; 2221 2222 folio = swap_cache_get_folio(entry); 2223 if (!folio) { 2224 struct vm_fault vmf = { 2225 .vma = vma, 2226 .address = addr, 2227 .real_address = addr, 2228 .pmd = pmd, 2229 }; 2230 2231 folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 2232 &vmf); 2233 } 2234 if (!folio) { 2235 swp_count = READ_ONCE(si->swap_map[offset]); 2236 if (swp_count == 0 || swp_count == SWAP_MAP_BAD) 2237 continue; 2238 return -ENOMEM; 2239 } 2240 2241 folio_lock(folio); 2242 folio_wait_writeback(folio); 2243 ret = unuse_pte(vma, pmd, addr, entry, folio); 2244 if (ret < 0) { 2245 folio_unlock(folio); 2246 folio_put(folio); 2247 return ret; 2248 } 2249 2250 folio_free_swap(folio); 2251 folio_unlock(folio); 2252 folio_put(folio); 2253 } while (addr += PAGE_SIZE, addr != end); 2254 2255 if (pte) 2256 pte_unmap(pte); 2257 return 0; 2258 } 2259 2260 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 2261 unsigned long addr, unsigned long end, 2262 unsigned int type) 2263 { 2264 pmd_t *pmd; 2265 unsigned long next; 2266 int ret; 2267 2268 pmd = pmd_offset(pud, addr); 2269 do { 2270 cond_resched(); 2271 next = pmd_addr_end(addr, end); 2272 ret = unuse_pte_range(vma, pmd, addr, next, type); 2273 if (ret) 2274 return ret; 2275 } while (pmd++, addr = next, addr != end); 2276 return 0; 2277 } 2278 2279 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 2280 unsigned long addr, unsigned long end, 2281 unsigned int type) 2282 { 2283 pud_t *pud; 2284 unsigned long next; 2285 int ret; 2286 2287 pud = pud_offset(p4d, addr); 2288 do { 2289 next = pud_addr_end(addr, end); 2290 if (pud_none_or_clear_bad(pud)) 2291 continue; 2292 ret = unuse_pmd_range(vma, pud, addr, next, type); 2293 if (ret) 2294 return ret; 2295 } while (pud++, addr = next, addr != end); 2296 return 0; 2297 } 2298 
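/*
 * Reader's note: an illustrative summary of the walk above and below, not
 * relied upon by any code in this file. unuse_vma() drives a conventional
 * five-level page table walk, pgd -> p4d -> pud -> pmd -> pte, and every
 * intermediate level has roughly the same shape (sketched here for a
 * hypothetical level "lvl" with parent "parent"):
 *
 *	lvl = lvl_offset(parent, addr);
 *	do {
 *		next = lvl_addr_end(addr, end);
 *		if (lvl_none_or_clear_bad(lvl))
 *			continue;
 *		ret = unuse_<child>_range(vma, lvl, addr, next, type);
 *		if (ret)
 *			return ret;
 *	} while (lvl++, addr = next, addr != end);
 *
 * The pmd level differs slightly: it adds a cond_resched() and leaves the
 * empty-table check to pte_offset_map() in unuse_pte_range(), which finally
 * brings each folio back via swap_cache_get_folio()/swapin_readahead() and
 * restores the PTE with unuse_pte().
 */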
2299 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 2300 unsigned long addr, unsigned long end, 2301 unsigned int type) 2302 { 2303 p4d_t *p4d; 2304 unsigned long next; 2305 int ret; 2306 2307 p4d = p4d_offset(pgd, addr); 2308 do { 2309 next = p4d_addr_end(addr, end); 2310 if (p4d_none_or_clear_bad(p4d)) 2311 continue; 2312 ret = unuse_pud_range(vma, p4d, addr, next, type); 2313 if (ret) 2314 return ret; 2315 } while (p4d++, addr = next, addr != end); 2316 return 0; 2317 } 2318 2319 static int unuse_vma(struct vm_area_struct *vma, unsigned int type) 2320 { 2321 pgd_t *pgd; 2322 unsigned long addr, end, next; 2323 int ret; 2324 2325 addr = vma->vm_start; 2326 end = vma->vm_end; 2327 2328 pgd = pgd_offset(vma->vm_mm, addr); 2329 do { 2330 next = pgd_addr_end(addr, end); 2331 if (pgd_none_or_clear_bad(pgd)) 2332 continue; 2333 ret = unuse_p4d_range(vma, pgd, addr, next, type); 2334 if (ret) 2335 return ret; 2336 } while (pgd++, addr = next, addr != end); 2337 return 0; 2338 } 2339 2340 static int unuse_mm(struct mm_struct *mm, unsigned int type) 2341 { 2342 struct vm_area_struct *vma; 2343 int ret = 0; 2344 VMA_ITERATOR(vmi, mm, 0); 2345 2346 mmap_read_lock(mm); 2347 if (check_stable_address_space(mm)) 2348 goto unlock; 2349 for_each_vma(vmi, vma) { 2350 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { 2351 ret = unuse_vma(vma, type); 2352 if (ret) 2353 break; 2354 } 2355 2356 cond_resched(); 2357 } 2358 unlock: 2359 mmap_read_unlock(mm); 2360 return ret; 2361 } 2362 2363 /* 2364 * Scan swap_map from current position to next entry still in use. 2365 * Return 0 if there are no inuse entries after prev till end of 2366 * the map. 2367 */ 2368 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2369 unsigned int prev) 2370 { 2371 unsigned int i; 2372 unsigned long swp_tb; 2373 unsigned char count; 2374 2375 /* 2376 * No need for swap_lock here: we're just looking 2377 * for whether an entry is in use, not modifying it; false 2378 * hits are okay, and sys_swapoff() has already prevented new 2379 * allocations from this area (while holding swap_lock). 
2380 */ 2381 for (i = prev + 1; i < si->max; i++) { 2382 count = READ_ONCE(si->swap_map[i]); 2383 swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), 2384 i % SWAPFILE_CLUSTER); 2385 if (count == SWAP_MAP_BAD) 2386 continue; 2387 if (count || swp_tb_is_folio(swp_tb)) 2388 break; 2389 if ((i % LATENCY_LIMIT) == 0) 2390 cond_resched(); 2391 } 2392 2393 if (i == si->max) 2394 i = 0; 2395 2396 return i; 2397 } 2398 2399 static int try_to_unuse(unsigned int type) 2400 { 2401 struct mm_struct *prev_mm; 2402 struct mm_struct *mm; 2403 struct list_head *p; 2404 int retval = 0; 2405 struct swap_info_struct *si = swap_info[type]; 2406 struct folio *folio; 2407 swp_entry_t entry; 2408 unsigned int i; 2409 2410 if (!swap_usage_in_pages(si)) 2411 goto success; 2412 2413 retry: 2414 retval = shmem_unuse(type); 2415 if (retval) 2416 return retval; 2417 2418 prev_mm = &init_mm; 2419 mmget(prev_mm); 2420 2421 spin_lock(&mmlist_lock); 2422 p = &init_mm.mmlist; 2423 while (swap_usage_in_pages(si) && 2424 !signal_pending(current) && 2425 (p = p->next) != &init_mm.mmlist) { 2426 2427 mm = list_entry(p, struct mm_struct, mmlist); 2428 if (!mmget_not_zero(mm)) 2429 continue; 2430 spin_unlock(&mmlist_lock); 2431 mmput(prev_mm); 2432 prev_mm = mm; 2433 retval = unuse_mm(mm, type); 2434 if (retval) { 2435 mmput(prev_mm); 2436 return retval; 2437 } 2438 2439 /* 2440 * Make sure that we aren't completely killing 2441 * interactive performance. 2442 */ 2443 cond_resched(); 2444 spin_lock(&mmlist_lock); 2445 } 2446 spin_unlock(&mmlist_lock); 2447 2448 mmput(prev_mm); 2449 2450 i = 0; 2451 while (swap_usage_in_pages(si) && 2452 !signal_pending(current) && 2453 (i = find_next_to_unuse(si, i)) != 0) { 2454 2455 entry = swp_entry(type, i); 2456 folio = swap_cache_get_folio(entry); 2457 if (!folio) 2458 continue; 2459 2460 /* 2461 * It is conceivable that a racing task removed this folio from 2462 * swap cache just before we acquired the page lock. The folio 2463 * might even be back in swap cache on another swap area. But 2464 * that is okay, folio_free_swap() only removes stale folios. 2465 */ 2466 folio_lock(folio); 2467 folio_wait_writeback(folio); 2468 folio_free_swap(folio); 2469 folio_unlock(folio); 2470 folio_put(folio); 2471 } 2472 2473 /* 2474 * Let's check again to see if there are still swap entries in the map. 2475 * If yes, we need to retry the unuse logic. 2476 * Under global memory pressure, swap entries can be reinserted back 2477 * into process space after the mmlist loop above passes over them. 2478 * 2479 * Limit the number of retries? No: when mmget_not_zero() 2480 * above fails, that mm is likely to be freeing swap from 2481 * exit_mmap(), which proceeds at its own independent pace; 2482 * and even shmem_writeout() could have been preempted after 2483 * folio_alloc_swap(), temporarily hiding that swap. It's easy 2484 * and robust (though cpu-intensive) just to keep retrying. 2485 */ 2486 if (swap_usage_in_pages(si)) { 2487 if (!signal_pending(current)) 2488 goto retry; 2489 return -EINTR; 2490 } 2491 2492 success: 2493 /* 2494 * Make sure that further cleanups after try_to_unuse() returns happen 2495 * after swap_range_free() reduces si->inuse_pages to 0. 2496 */ 2497 smp_mb(); 2498 return 0; 2499 } 2500 2501 /* 2502 * After a successful try_to_unuse, if no swap is now in use, we know 2503 * we can empty the mmlist. swap_lock must be held on entry and exit.
2504 * Note that mmlist_lock nests inside swap_lock, and an mm must be 2505 * added to the mmlist just after page_duplicate - before would be racy. 2506 */ 2507 static void drain_mmlist(void) 2508 { 2509 struct list_head *p, *next; 2510 unsigned int type; 2511 2512 for (type = 0; type < nr_swapfiles; type++) 2513 if (swap_usage_in_pages(swap_info[type])) 2514 return; 2515 spin_lock(&mmlist_lock); 2516 list_for_each_safe(p, next, &init_mm.mmlist) 2517 list_del_init(p); 2518 spin_unlock(&mmlist_lock); 2519 } 2520 2521 /* 2522 * Free all of a swapdev's extent information 2523 */ 2524 static void destroy_swap_extents(struct swap_info_struct *sis) 2525 { 2526 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { 2527 struct rb_node *rb = sis->swap_extent_root.rb_node; 2528 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); 2529 2530 rb_erase(rb, &sis->swap_extent_root); 2531 kfree(se); 2532 } 2533 2534 if (sis->flags & SWP_ACTIVATED) { 2535 struct file *swap_file = sis->swap_file; 2536 struct address_space *mapping = swap_file->f_mapping; 2537 2538 sis->flags &= ~SWP_ACTIVATED; 2539 if (mapping->a_ops->swap_deactivate) 2540 mapping->a_ops->swap_deactivate(swap_file); 2541 } 2542 } 2543 2544 /* 2545 * Add a block range (and the corresponding page range) into this swapdev's 2546 * extent tree. 2547 * 2548 * This function rather assumes that it is called in ascending page order. 2549 */ 2550 int 2551 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 2552 unsigned long nr_pages, sector_t start_block) 2553 { 2554 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; 2555 struct swap_extent *se; 2556 struct swap_extent *new_se; 2557 2558 /* 2559 * place the new node at the right most since the 2560 * function is called in ascending page order. 2561 */ 2562 while (*link) { 2563 parent = *link; 2564 link = &parent->rb_right; 2565 } 2566 2567 if (parent) { 2568 se = rb_entry(parent, struct swap_extent, rb_node); 2569 BUG_ON(se->start_page + se->nr_pages != start_page); 2570 if (se->start_block + se->nr_pages == start_block) { 2571 /* Merge it */ 2572 se->nr_pages += nr_pages; 2573 return 0; 2574 } 2575 } 2576 2577 /* No merge, insert a new extent. */ 2578 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 2579 if (new_se == NULL) 2580 return -ENOMEM; 2581 new_se->start_page = start_page; 2582 new_se->nr_pages = nr_pages; 2583 new_se->start_block = start_block; 2584 2585 rb_link_node(&new_se->rb_node, parent, link); 2586 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); 2587 return 1; 2588 } 2589 EXPORT_SYMBOL_GPL(add_swap_extent); 2590 2591 /* 2592 * A `swap extent' is a simple thing which maps a contiguous range of pages 2593 * onto a contiguous range of disk blocks. A rbtree of swap extents is 2594 * built at swapon time and is then used at swap_writepage/swap_read_folio 2595 * time for locating where on disk a page belongs. 2596 * 2597 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2598 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2599 * swap files identically. 2600 * 2601 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2602 * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 2603 * swapfiles are handled *identically* after swapon time. 2604 * 2605 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2606 * and will parse them into a rbtree, in PAGE_SIZE chunks. 
If some stray 2607 * blocks are found which do not fall within the PAGE_SIZE alignment 2608 * requirements, they are simply tossed out - we will never use those blocks 2609 * for swapping. 2610 * 2611 * For all swap devices we set S_SWAPFILE across the life of the swapon. This 2612 * prevents users from writing to the swap device, which will corrupt memory. 2613 * 2614 * The amount of disk space which a single swap extent represents varies. 2615 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2616 * extents in the rbtree. - akpm. 2617 */ 2618 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2619 { 2620 struct file *swap_file = sis->swap_file; 2621 struct address_space *mapping = swap_file->f_mapping; 2622 struct inode *inode = mapping->host; 2623 int ret; 2624 2625 if (S_ISBLK(inode->i_mode)) { 2626 ret = add_swap_extent(sis, 0, sis->max, 0); 2627 *span = sis->pages; 2628 return ret; 2629 } 2630 2631 if (mapping->a_ops->swap_activate) { 2632 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2633 if (ret < 0) 2634 return ret; 2635 sis->flags |= SWP_ACTIVATED; 2636 if ((sis->flags & SWP_FS_OPS) && 2637 sio_pool_init() != 0) { 2638 destroy_swap_extents(sis); 2639 return -ENOMEM; 2640 } 2641 return ret; 2642 } 2643 2644 return generic_swapfile_activate(sis, swap_file, span); 2645 } 2646 2647 static void setup_swap_info(struct swap_info_struct *si, int prio, 2648 unsigned char *swap_map, 2649 struct swap_cluster_info *cluster_info, 2650 unsigned long *zeromap) 2651 { 2652 si->prio = prio; 2653 /* 2654 * the plist prio is negated because plist ordering is 2655 * low-to-high, while swap ordering is high-to-low 2656 */ 2657 si->list.prio = -si->prio; 2658 si->avail_list.prio = -si->prio; 2659 si->swap_map = swap_map; 2660 si->cluster_info = cluster_info; 2661 si->zeromap = zeromap; 2662 } 2663 2664 static void _enable_swap_info(struct swap_info_struct *si) 2665 { 2666 atomic_long_add(si->pages, &nr_swap_pages); 2667 total_swap_pages += si->pages; 2668 2669 assert_spin_locked(&swap_lock); 2670 2671 plist_add(&si->list, &swap_active_head); 2672 2673 /* Add back to available list */ 2674 add_to_avail_list(si, true); 2675 } 2676 2677 static void enable_swap_info(struct swap_info_struct *si, int prio, 2678 unsigned char *swap_map, 2679 struct swap_cluster_info *cluster_info, 2680 unsigned long *zeromap) 2681 { 2682 spin_lock(&swap_lock); 2683 spin_lock(&si->lock); 2684 setup_swap_info(si, prio, swap_map, cluster_info, zeromap); 2685 spin_unlock(&si->lock); 2686 spin_unlock(&swap_lock); 2687 /* 2688 * Finished initializing swap device, now it's safe to reference it. 2689 */ 2690 percpu_ref_resurrect(&si->users); 2691 spin_lock(&swap_lock); 2692 spin_lock(&si->lock); 2693 _enable_swap_info(si); 2694 spin_unlock(&si->lock); 2695 spin_unlock(&swap_lock); 2696 } 2697 2698 static void reinsert_swap_info(struct swap_info_struct *si) 2699 { 2700 spin_lock(&swap_lock); 2701 spin_lock(&si->lock); 2702 setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); 2703 _enable_swap_info(si); 2704 spin_unlock(&si->lock); 2705 spin_unlock(&swap_lock); 2706 } 2707 2708 /* 2709 * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range 2710 * see the updated flags, so there will be no more allocations. 
2711 */ 2712 static void wait_for_allocation(struct swap_info_struct *si) 2713 { 2714 unsigned long offset; 2715 unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); 2716 struct swap_cluster_info *ci; 2717 2718 BUG_ON(si->flags & SWP_WRITEOK); 2719 2720 for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { 2721 ci = swap_cluster_lock(si, offset); 2722 swap_cluster_unlock(ci); 2723 } 2724 } 2725 2726 static void free_cluster_info(struct swap_cluster_info *cluster_info, 2727 unsigned long maxpages) 2728 { 2729 struct swap_cluster_info *ci; 2730 int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2731 2732 if (!cluster_info) 2733 return; 2734 for (i = 0; i < nr_clusters; i++) { 2735 ci = cluster_info + i; 2736 /* Cluster with bad marks count will have a remaining table */ 2737 spin_lock(&ci->lock); 2738 if (rcu_dereference_protected(ci->table, true)) { 2739 ci->count = 0; 2740 swap_cluster_free_table(ci); 2741 } 2742 spin_unlock(&ci->lock); 2743 } 2744 kvfree(cluster_info); 2745 } 2746 2747 /* 2748 * Called after swap device's reference count is dead, so 2749 * neither scan nor allocation will use it. 2750 */ 2751 static void flush_percpu_swap_cluster(struct swap_info_struct *si) 2752 { 2753 int cpu, i; 2754 struct swap_info_struct **pcp_si; 2755 2756 for_each_possible_cpu(cpu) { 2757 pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu); 2758 /* 2759 * Invalidate the percpu swap cluster cache, si->users 2760 * is dead, so no new user will point to it, just flush 2761 * any existing user. 2762 */ 2763 for (i = 0; i < SWAP_NR_ORDERS; i++) 2764 cmpxchg(&pcp_si[i], si, NULL); 2765 } 2766 } 2767 2768 2769 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2770 { 2771 struct swap_info_struct *p = NULL; 2772 unsigned char *swap_map; 2773 unsigned long *zeromap; 2774 struct swap_cluster_info *cluster_info; 2775 struct file *swap_file, *victim; 2776 struct address_space *mapping; 2777 struct inode *inode; 2778 unsigned int maxpages; 2779 int err, found = 0; 2780 2781 if (!capable(CAP_SYS_ADMIN)) 2782 return -EPERM; 2783 2784 BUG_ON(!current->mm); 2785 2786 CLASS(filename, pathname)(specialfile); 2787 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2788 if (IS_ERR(victim)) 2789 return PTR_ERR(victim); 2790 2791 mapping = victim->f_mapping; 2792 spin_lock(&swap_lock); 2793 plist_for_each_entry(p, &swap_active_head, list) { 2794 if (p->flags & SWP_WRITEOK) { 2795 if (p->swap_file->f_mapping == mapping) { 2796 found = 1; 2797 break; 2798 } 2799 } 2800 } 2801 if (!found) { 2802 err = -EINVAL; 2803 spin_unlock(&swap_lock); 2804 goto out_dput; 2805 } 2806 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2807 vm_unacct_memory(p->pages); 2808 else { 2809 err = -ENOMEM; 2810 spin_unlock(&swap_lock); 2811 goto out_dput; 2812 } 2813 spin_lock(&p->lock); 2814 del_from_avail_list(p, true); 2815 plist_del(&p->list, &swap_active_head); 2816 atomic_long_sub(p->pages, &nr_swap_pages); 2817 total_swap_pages -= p->pages; 2818 spin_unlock(&p->lock); 2819 spin_unlock(&swap_lock); 2820 2821 wait_for_allocation(p); 2822 2823 set_current_oom_origin(); 2824 err = try_to_unuse(p->type); 2825 clear_current_oom_origin(); 2826 2827 if (err) { 2828 /* re-insert swap space back into swap_list */ 2829 reinsert_swap_info(p); 2830 goto out_dput; 2831 } 2832 2833 /* 2834 * Wait for swap operations protected by get/put_swap_device() 2835 * to complete. 
Because of synchronize_rcu() here, all swap 2836 * operations protected by RCU reader side lock (including any 2837 * spinlock) will be waited too. This makes it easy to 2838 * prevent folio_test_swapcache() and the following swap cache 2839 * operations from racing with swapoff. 2840 */ 2841 percpu_ref_kill(&p->users); 2842 synchronize_rcu(); 2843 wait_for_completion(&p->comp); 2844 2845 flush_work(&p->discard_work); 2846 flush_work(&p->reclaim_work); 2847 flush_percpu_swap_cluster(p); 2848 2849 destroy_swap_extents(p); 2850 if (p->flags & SWP_CONTINUED) 2851 free_swap_count_continuations(p); 2852 2853 if (!(p->flags & SWP_SOLIDSTATE)) 2854 atomic_dec(&nr_rotate_swap); 2855 2856 mutex_lock(&swapon_mutex); 2857 spin_lock(&swap_lock); 2858 spin_lock(&p->lock); 2859 drain_mmlist(); 2860 2861 swap_file = p->swap_file; 2862 p->swap_file = NULL; 2863 swap_map = p->swap_map; 2864 p->swap_map = NULL; 2865 zeromap = p->zeromap; 2866 p->zeromap = NULL; 2867 maxpages = p->max; 2868 cluster_info = p->cluster_info; 2869 p->max = 0; 2870 p->cluster_info = NULL; 2871 spin_unlock(&p->lock); 2872 spin_unlock(&swap_lock); 2873 arch_swap_invalidate_area(p->type); 2874 zswap_swapoff(p->type); 2875 mutex_unlock(&swapon_mutex); 2876 kfree(p->global_cluster); 2877 p->global_cluster = NULL; 2878 vfree(swap_map); 2879 kvfree(zeromap); 2880 free_cluster_info(cluster_info, maxpages); 2881 /* Destroy swap account information */ 2882 swap_cgroup_swapoff(p->type); 2883 2884 inode = mapping->host; 2885 2886 inode_lock(inode); 2887 inode->i_flags &= ~S_SWAPFILE; 2888 inode_unlock(inode); 2889 filp_close(swap_file, NULL); 2890 2891 /* 2892 * Clear the SWP_USED flag after all resources are freed so that swapon 2893 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2894 * not hold p->lock after we cleared its SWP_WRITEOK. 
2895 */ 2896 spin_lock(&swap_lock); 2897 p->flags = 0; 2898 spin_unlock(&swap_lock); 2899 2900 err = 0; 2901 atomic_inc(&proc_poll_event); 2902 wake_up_interruptible(&proc_poll_wait); 2903 2904 out_dput: 2905 filp_close(victim, NULL); 2906 return err; 2907 } 2908 2909 #ifdef CONFIG_PROC_FS 2910 static __poll_t swaps_poll(struct file *file, poll_table *wait) 2911 { 2912 struct seq_file *seq = file->private_data; 2913 2914 poll_wait(file, &proc_poll_wait, wait); 2915 2916 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2917 seq->poll_event = atomic_read(&proc_poll_event); 2918 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; 2919 } 2920 2921 return EPOLLIN | EPOLLRDNORM; 2922 } 2923 2924 /* iterator */ 2925 static void *swap_start(struct seq_file *swap, loff_t *pos) 2926 { 2927 struct swap_info_struct *si; 2928 int type; 2929 loff_t l = *pos; 2930 2931 mutex_lock(&swapon_mutex); 2932 2933 if (!l) 2934 return SEQ_START_TOKEN; 2935 2936 for (type = 0; (si = swap_type_to_info(type)); type++) { 2937 if (!(si->flags & SWP_USED) || !si->swap_map) 2938 continue; 2939 if (!--l) 2940 return si; 2941 } 2942 2943 return NULL; 2944 } 2945 2946 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2947 { 2948 struct swap_info_struct *si = v; 2949 int type; 2950 2951 if (v == SEQ_START_TOKEN) 2952 type = 0; 2953 else 2954 type = si->type + 1; 2955 2956 ++(*pos); 2957 for (; (si = swap_type_to_info(type)); type++) { 2958 if (!(si->flags & SWP_USED) || !si->swap_map) 2959 continue; 2960 return si; 2961 } 2962 2963 return NULL; 2964 } 2965 2966 static void swap_stop(struct seq_file *swap, void *v) 2967 { 2968 mutex_unlock(&swapon_mutex); 2969 } 2970 2971 static int swap_show(struct seq_file *swap, void *v) 2972 { 2973 struct swap_info_struct *si = v; 2974 struct file *file; 2975 int len; 2976 unsigned long bytes, inuse; 2977 2978 if (si == SEQ_START_TOKEN) { 2979 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); 2980 return 0; 2981 } 2982 2983 bytes = K(si->pages); 2984 inuse = K(swap_usage_in_pages(si)); 2985 2986 file = si->swap_file; 2987 len = seq_file_path(swap, file, " \t\n\\"); 2988 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", 2989 len < 40 ? 40 - len : 1, " ", 2990 S_ISBLK(file_inode(file)->i_mode) ? 2991 "partition" : "file\t", 2992 bytes, bytes < 10000000 ? "\t" : "", 2993 inuse, inuse < 10000000 ? 
"\t" : "", 2994 si->prio); 2995 return 0; 2996 } 2997 2998 static const struct seq_operations swaps_op = { 2999 .start = swap_start, 3000 .next = swap_next, 3001 .stop = swap_stop, 3002 .show = swap_show 3003 }; 3004 3005 static int swaps_open(struct inode *inode, struct file *file) 3006 { 3007 struct seq_file *seq; 3008 int ret; 3009 3010 ret = seq_open(file, &swaps_op); 3011 if (ret) 3012 return ret; 3013 3014 seq = file->private_data; 3015 seq->poll_event = atomic_read(&proc_poll_event); 3016 return 0; 3017 } 3018 3019 static const struct proc_ops swaps_proc_ops = { 3020 .proc_flags = PROC_ENTRY_PERMANENT, 3021 .proc_open = swaps_open, 3022 .proc_read = seq_read, 3023 .proc_lseek = seq_lseek, 3024 .proc_release = seq_release, 3025 .proc_poll = swaps_poll, 3026 }; 3027 3028 static int __init procswaps_init(void) 3029 { 3030 proc_create("swaps", 0, NULL, &swaps_proc_ops); 3031 return 0; 3032 } 3033 __initcall(procswaps_init); 3034 #endif /* CONFIG_PROC_FS */ 3035 3036 #ifdef MAX_SWAPFILES_CHECK 3037 static int __init max_swapfiles_check(void) 3038 { 3039 MAX_SWAPFILES_CHECK(); 3040 return 0; 3041 } 3042 late_initcall(max_swapfiles_check); 3043 #endif 3044 3045 static struct swap_info_struct *alloc_swap_info(void) 3046 { 3047 struct swap_info_struct *p; 3048 struct swap_info_struct *defer = NULL; 3049 unsigned int type; 3050 3051 p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); 3052 if (!p) 3053 return ERR_PTR(-ENOMEM); 3054 3055 if (percpu_ref_init(&p->users, swap_users_ref_free, 3056 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { 3057 kvfree(p); 3058 return ERR_PTR(-ENOMEM); 3059 } 3060 3061 spin_lock(&swap_lock); 3062 for (type = 0; type < nr_swapfiles; type++) { 3063 if (!(swap_info[type]->flags & SWP_USED)) 3064 break; 3065 } 3066 if (type >= MAX_SWAPFILES) { 3067 spin_unlock(&swap_lock); 3068 percpu_ref_exit(&p->users); 3069 kvfree(p); 3070 return ERR_PTR(-EPERM); 3071 } 3072 if (type >= nr_swapfiles) { 3073 p->type = type; 3074 /* 3075 * Publish the swap_info_struct after initializing it. 3076 * Note that kvzalloc() above zeroes all its fields. 3077 */ 3078 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ 3079 nr_swapfiles++; 3080 } else { 3081 defer = p; 3082 p = swap_info[type]; 3083 /* 3084 * Do not memset this entry: a racing procfs swap_next() 3085 * would be relying on p->type to remain valid. 3086 */ 3087 } 3088 p->swap_extent_root = RB_ROOT; 3089 plist_node_init(&p->list, 0); 3090 plist_node_init(&p->avail_list, 0); 3091 p->flags = SWP_USED; 3092 spin_unlock(&swap_lock); 3093 if (defer) { 3094 percpu_ref_exit(&defer->users); 3095 kvfree(defer); 3096 } 3097 spin_lock_init(&p->lock); 3098 spin_lock_init(&p->cont_lock); 3099 atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); 3100 init_completion(&p->comp); 3101 3102 return p; 3103 } 3104 3105 static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) 3106 { 3107 if (S_ISBLK(inode->i_mode)) { 3108 si->bdev = I_BDEV(inode); 3109 /* 3110 * Zoned block devices contain zones that have a sequential 3111 * write only restriction. Hence zoned block devices are not 3112 * suitable for swapping. Disallow them here. 3113 */ 3114 if (bdev_is_zoned(si->bdev)) 3115 return -EINVAL; 3116 si->flags |= SWP_BLKDEV; 3117 } else if (S_ISREG(inode->i_mode)) { 3118 si->bdev = inode->i_sb->s_bdev; 3119 } 3120 3121 return 0; 3122 } 3123 3124 3125 /* 3126 * Find out how many pages are allowed for a single swap device. 
There 3127 * are two limiting factors: 3128 * 1) the number of bits for the swap offset in the swp_entry_t type, and 3129 * 2) the number of bits in the swap pte, as defined by the different 3130 * architectures. 3131 * 3132 * In order to find the largest possible bit mask, a swap entry with 3133 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 3134 * decoded to a swp_entry_t again, and finally the swap offset is 3135 * extracted. 3136 * 3137 * This will mask all the bits from the initial ~0UL mask that can't 3138 * be encoded in either the swp_entry_t or the architecture definition 3139 * of a swap pte. 3140 */ 3141 unsigned long generic_max_swapfile_size(void) 3142 { 3143 swp_entry_t entry = swp_entry(0, ~0UL); 3144 const pte_t pte = softleaf_to_pte(entry); 3145 3146 /* 3147 * Since the PTE can be an invalid softleaf entry (e.g. the none PTE), 3148 * we need to do this manually. 3149 */ 3150 entry = __pte_to_swp_entry(pte); 3151 entry = swp_entry(__swp_type(entry), __swp_offset(entry)); 3152 3153 return swp_offset(entry) + 1; 3154 } 3155 3156 /* Can be overridden by an architecture for additional checks. */ 3157 __weak unsigned long arch_max_swapfile_size(void) 3158 { 3159 return generic_max_swapfile_size(); 3160 } 3161 3162 static unsigned long read_swap_header(struct swap_info_struct *si, 3163 union swap_header *swap_header, 3164 struct inode *inode) 3165 { 3166 int i; 3167 unsigned long maxpages; 3168 unsigned long swapfilepages; 3169 unsigned long last_page; 3170 3171 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 3172 pr_err("Unable to find swap-space signature\n"); 3173 return 0; 3174 } 3175 3176 /* swap partition endianness hack... */ 3177 if (swab32(swap_header->info.version) == 1) { 3178 swab32s(&swap_header->info.version); 3179 swab32s(&swap_header->info.last_page); 3180 swab32s(&swap_header->info.nr_badpages); 3181 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3182 return 0; 3183 for (i = 0; i < swap_header->info.nr_badpages; i++) 3184 swab32s(&swap_header->info.badpages[i]); 3185 } 3186 /* Check the swap header's sub-version */ 3187 if (swap_header->info.version != 1) { 3188 pr_warn("Unable to handle swap header version %d\n", 3189 swap_header->info.version); 3190 return 0; 3191 } 3192 3193 maxpages = swapfile_maximum_size; 3194 last_page = swap_header->info.last_page; 3195 if (!last_page) { 3196 pr_warn("Empty swap-file\n"); 3197 return 0; 3198 } 3199 if (last_page > maxpages) { 3200 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 3201 K(maxpages), K(last_page)); 3202 } 3203 if (maxpages > last_page) { 3204 maxpages = last_page + 1; 3205 /* p->max is an unsigned int: don't overflow it */ 3206 if ((unsigned int)maxpages == 0) 3207 maxpages = UINT_MAX; 3208 } 3209 3210 if (!maxpages) 3211 return 0; 3212 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 3213 if (swapfilepages && maxpages > swapfilepages) { 3214 pr_warn("Swap area shorter than signature indicates\n"); 3215 return 0; 3216 } 3217 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 3218 return 0; 3219 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3220 return 0; 3221 3222 return maxpages; 3223 } 3224 3225 static int setup_swap_map(struct swap_info_struct *si, 3226 union swap_header *swap_header, 3227 unsigned char *swap_map, 3228 unsigned long maxpages) 3229 { 3230 unsigned long i; 3231 3232 swap_map[0] = SWAP_MAP_BAD; /* omit header page */ 3233 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3234 unsigned int page_nr = 
swap_header->info.badpages[i]; 3235 if (page_nr == 0 || page_nr > swap_header->info.last_page) 3236 return -EINVAL; 3237 if (page_nr < maxpages) { 3238 swap_map[page_nr] = SWAP_MAP_BAD; 3239 si->pages--; 3240 } 3241 } 3242 3243 if (!si->pages) { 3244 pr_warn("Empty swap-file\n"); 3245 return -EINVAL; 3246 } 3247 3248 return 0; 3249 } 3250 3251 static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, 3252 union swap_header *swap_header, 3253 unsigned long maxpages) 3254 { 3255 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 3256 struct swap_cluster_info *cluster_info; 3257 int err = -ENOMEM; 3258 unsigned long i; 3259 3260 cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); 3261 if (!cluster_info) 3262 goto err; 3263 3264 for (i = 0; i < nr_clusters; i++) 3265 spin_lock_init(&cluster_info[i].lock); 3266 3267 if (!(si->flags & SWP_SOLIDSTATE)) { 3268 si->global_cluster = kmalloc(sizeof(*si->global_cluster), 3269 GFP_KERNEL); 3270 if (!si->global_cluster) 3271 goto err; 3272 for (i = 0; i < SWAP_NR_ORDERS; i++) 3273 si->global_cluster->next[i] = SWAP_ENTRY_INVALID; 3274 spin_lock_init(&si->global_cluster_lock); 3275 } 3276 3277 /* 3278 * Mark unusable pages as unavailable. The clusters aren't 3279 * marked free yet, so no list operations are involved yet. 3280 * 3281 * See setup_swap_map(): header page, bad pages, 3282 * and the EOF part of the last cluster. 3283 */ 3284 err = swap_cluster_setup_bad_slot(cluster_info, 0); 3285 if (err) 3286 goto err; 3287 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3288 unsigned int page_nr = swap_header->info.badpages[i]; 3289 3290 if (page_nr >= maxpages) 3291 continue; 3292 err = swap_cluster_setup_bad_slot(cluster_info, page_nr); 3293 if (err) 3294 goto err; 3295 } 3296 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { 3297 err = swap_cluster_setup_bad_slot(cluster_info, i); 3298 if (err) 3299 goto err; 3300 } 3301 3302 INIT_LIST_HEAD(&si->free_clusters); 3303 INIT_LIST_HEAD(&si->full_clusters); 3304 INIT_LIST_HEAD(&si->discard_clusters); 3305 3306 for (i = 0; i < SWAP_NR_ORDERS; i++) { 3307 INIT_LIST_HEAD(&si->nonfull_clusters[i]); 3308 INIT_LIST_HEAD(&si->frag_clusters[i]); 3309 } 3310 3311 for (i = 0; i < nr_clusters; i++) { 3312 struct swap_cluster_info *ci = &cluster_info[i]; 3313 3314 if (ci->count) { 3315 ci->flags = CLUSTER_FLAG_NONFULL; 3316 list_add_tail(&ci->list, &si->nonfull_clusters[0]); 3317 } else { 3318 ci->flags = CLUSTER_FLAG_FREE; 3319 list_add_tail(&ci->list, &si->free_clusters); 3320 } 3321 } 3322 3323 return cluster_info; 3324 err: 3325 free_cluster_info(cluster_info, maxpages); 3326 return ERR_PTR(err); 3327 } 3328 3329 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 3330 { 3331 struct swap_info_struct *si; 3332 struct file *swap_file = NULL; 3333 struct address_space *mapping; 3334 struct dentry *dentry; 3335 int prio; 3336 int error; 3337 union swap_header *swap_header; 3338 int nr_extents; 3339 sector_t span; 3340 unsigned long maxpages; 3341 unsigned char *swap_map = NULL; 3342 unsigned long *zeromap = NULL; 3343 struct swap_cluster_info *cluster_info = NULL; 3344 struct folio *folio = NULL; 3345 struct inode *inode = NULL; 3346 bool inced_nr_rotate_swap = false; 3347 3348 if (swap_flags & ~SWAP_FLAGS_VALID) 3349 return -EINVAL; 3350 3351 if (!capable(CAP_SYS_ADMIN)) 3352 return -EPERM; 3353 3354 si = alloc_swap_info(); 3355 if (IS_ERR(si)) 3356 return PTR_ERR(si); 3357 3358 INIT_WORK(&si->discard_work, 
swap_discard_work); 3359 INIT_WORK(&si->reclaim_work, swap_reclaim_work); 3360 3361 CLASS(filename, name)(specialfile); 3362 swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); 3363 if (IS_ERR(swap_file)) { 3364 error = PTR_ERR(swap_file); 3365 swap_file = NULL; 3366 goto bad_swap; 3367 } 3368 3369 si->swap_file = swap_file; 3370 mapping = swap_file->f_mapping; 3371 dentry = swap_file->f_path.dentry; 3372 inode = mapping->host; 3373 3374 error = claim_swapfile(si, inode); 3375 if (unlikely(error)) 3376 goto bad_swap; 3377 3378 inode_lock(inode); 3379 if (d_unlinked(dentry) || cant_mount(dentry)) { 3380 error = -ENOENT; 3381 goto bad_swap_unlock_inode; 3382 } 3383 if (IS_SWAPFILE(inode)) { 3384 error = -EBUSY; 3385 goto bad_swap_unlock_inode; 3386 } 3387 3388 /* 3389 * The swap subsystem needs a major overhaul to support this. 3390 * It doesn't work yet so just disable it for now. 3391 */ 3392 if (mapping_min_folio_order(mapping) > 0) { 3393 error = -EINVAL; 3394 goto bad_swap_unlock_inode; 3395 } 3396 3397 /* 3398 * Read the swap header. 3399 */ 3400 if (!mapping->a_ops->read_folio) { 3401 error = -EINVAL; 3402 goto bad_swap_unlock_inode; 3403 } 3404 folio = read_mapping_folio(mapping, 0, swap_file); 3405 if (IS_ERR(folio)) { 3406 error = PTR_ERR(folio); 3407 goto bad_swap_unlock_inode; 3408 } 3409 swap_header = kmap_local_folio(folio, 0); 3410 3411 maxpages = read_swap_header(si, swap_header, inode); 3412 if (unlikely(!maxpages)) { 3413 error = -EINVAL; 3414 goto bad_swap_unlock_inode; 3415 } 3416 3417 si->max = maxpages; 3418 si->pages = maxpages - 1; 3419 nr_extents = setup_swap_extents(si, &span); 3420 if (nr_extents < 0) { 3421 error = nr_extents; 3422 goto bad_swap_unlock_inode; 3423 } 3424 if (si->pages != si->max - 1) { 3425 pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); 3426 error = -EINVAL; 3427 goto bad_swap_unlock_inode; 3428 } 3429 3430 maxpages = si->max; 3431 3432 /* OK, set up the swap map and apply the bad block list */ 3433 swap_map = vzalloc(maxpages); 3434 if (!swap_map) { 3435 error = -ENOMEM; 3436 goto bad_swap_unlock_inode; 3437 } 3438 3439 error = swap_cgroup_swapon(si->type, maxpages); 3440 if (error) 3441 goto bad_swap_unlock_inode; 3442 3443 error = setup_swap_map(si, swap_header, swap_map, maxpages); 3444 if (error) 3445 goto bad_swap_unlock_inode; 3446 3447 /* 3448 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might 3449 * be above MAX_PAGE_ORDER in case of a large swap file.
3450 */ 3451 zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), 3452 GFP_KERNEL | __GFP_ZERO); 3453 if (!zeromap) { 3454 error = -ENOMEM; 3455 goto bad_swap_unlock_inode; 3456 } 3457 3458 if (si->bdev && bdev_stable_writes(si->bdev)) 3459 si->flags |= SWP_STABLE_WRITES; 3460 3461 if (si->bdev && bdev_synchronous(si->bdev)) 3462 si->flags |= SWP_SYNCHRONOUS_IO; 3463 3464 if (si->bdev && bdev_nonrot(si->bdev)) { 3465 si->flags |= SWP_SOLIDSTATE; 3466 } else { 3467 atomic_inc(&nr_rotate_swap); 3468 inced_nr_rotate_swap = true; 3469 } 3470 3471 cluster_info = setup_clusters(si, swap_header, maxpages); 3472 if (IS_ERR(cluster_info)) { 3473 error = PTR_ERR(cluster_info); 3474 cluster_info = NULL; 3475 goto bad_swap_unlock_inode; 3476 } 3477 3478 if ((swap_flags & SWAP_FLAG_DISCARD) && 3479 si->bdev && bdev_max_discard_sectors(si->bdev)) { 3480 /* 3481 * When discard is enabled for swap with no particular 3482 * policy flagged, we set all swap discard flags here in 3483 * order to sustain backward compatibility with older 3484 * swapon(8) releases. 3485 */ 3486 si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 3487 SWP_PAGE_DISCARD); 3488 3489 /* 3490 * By flagging sys_swapon, a sysadmin can tell us to 3491 * either do single-time area discards only, or to just 3492 * perform discards for released swap page-clusters. 3493 * Now it's time to adjust the si->flags accordingly. 3494 */ 3495 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 3496 si->flags &= ~SWP_PAGE_DISCARD; 3497 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 3498 si->flags &= ~SWP_AREA_DISCARD; 3499 3500 /* issue a swapon-time discard if it's still required */ 3501 if (si->flags & SWP_AREA_DISCARD) { 3502 int err = discard_swap(si); 3503 if (unlikely(err)) 3504 pr_err("swapon: discard_swap(%p): %d\n", 3505 si, err); 3506 } 3507 } 3508 3509 error = zswap_swapon(si->type, maxpages); 3510 if (error) 3511 goto bad_swap_unlock_inode; 3512 3513 /* 3514 * Flush any pending IO and dirty mappings before we start using this 3515 * swap device. 3516 */ 3517 inode->i_flags |= S_SWAPFILE; 3518 error = inode_drain_writes(inode); 3519 if (error) { 3520 inode->i_flags &= ~S_SWAPFILE; 3521 goto free_swap_zswap; 3522 } 3523 3524 mutex_lock(&swapon_mutex); 3525 prio = DEF_SWAP_PRIO; 3526 if (swap_flags & SWAP_FLAG_PREFER) 3527 prio = swap_flags & SWAP_FLAG_PRIO_MASK; 3528 enable_swap_info(si, prio, swap_map, cluster_info, zeromap); 3529 3530 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", 3531 K(si->pages), name->name, si->prio, nr_extents, 3532 K((unsigned long long)span), 3533 (si->flags & SWP_SOLIDSTATE) ? "SS" : "", 3534 (si->flags & SWP_DISCARDABLE) ? "D" : "", 3535 (si->flags & SWP_AREA_DISCARD) ? "s" : "", 3536 (si->flags & SWP_PAGE_DISCARD) ?
"c" : ""); 3537 3538 mutex_unlock(&swapon_mutex); 3539 atomic_inc(&proc_poll_event); 3540 wake_up_interruptible(&proc_poll_wait); 3541 3542 error = 0; 3543 goto out; 3544 free_swap_zswap: 3545 zswap_swapoff(si->type); 3546 bad_swap_unlock_inode: 3547 inode_unlock(inode); 3548 bad_swap: 3549 kfree(si->global_cluster); 3550 si->global_cluster = NULL; 3551 inode = NULL; 3552 destroy_swap_extents(si); 3553 swap_cgroup_swapoff(si->type); 3554 spin_lock(&swap_lock); 3555 si->swap_file = NULL; 3556 si->flags = 0; 3557 spin_unlock(&swap_lock); 3558 vfree(swap_map); 3559 kvfree(zeromap); 3560 if (cluster_info) 3561 free_cluster_info(cluster_info, maxpages); 3562 if (inced_nr_rotate_swap) 3563 atomic_dec(&nr_rotate_swap); 3564 if (swap_file) 3565 filp_close(swap_file, NULL); 3566 out: 3567 if (!IS_ERR_OR_NULL(folio)) 3568 folio_release_kmap(folio, swap_header); 3569 if (inode) 3570 inode_unlock(inode); 3571 return error; 3572 } 3573 3574 void si_swapinfo(struct sysinfo *val) 3575 { 3576 unsigned int type; 3577 unsigned long nr_to_be_unused = 0; 3578 3579 spin_lock(&swap_lock); 3580 for (type = 0; type < nr_swapfiles; type++) { 3581 struct swap_info_struct *si = swap_info[type]; 3582 3583 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 3584 nr_to_be_unused += swap_usage_in_pages(si); 3585 } 3586 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 3587 val->totalswap = total_swap_pages + nr_to_be_unused; 3588 spin_unlock(&swap_lock); 3589 } 3590 3591 /* 3592 * Verify that nr swap entries are valid and increment their swap map counts. 3593 * 3594 * Return values: 3595 * - success -> 0 3596 * - swp_entry is invalid -> EINVAL 3597 * - swap-mapped reference is requested but the entry is not used. -> ENOENT 3598 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 3599 */ 3600 static int swap_dup_entries(struct swap_info_struct *si, 3601 struct swap_cluster_info *ci, 3602 unsigned long offset, 3603 unsigned char usage, int nr) 3604 { 3605 int i; 3606 unsigned char count; 3607 3608 for (i = 0; i < nr; i++) { 3609 count = si->swap_map[offset + i]; 3610 /* 3611 * For swap out, the allocator never allocates bad slots. For 3612 * swapin, readahead is guarded by swap_entry_swapped. 3613 */ 3614 if (WARN_ON(count == SWAP_MAP_BAD)) 3615 return -ENOENT; 3616 /* 3617 * Swap count duplication must be guarded by either the swap cache folio (from 3618 * folio_dup_swap) or an external lock on the existing entry (from swap_dup_entry_direct). 3619 */ 3620 if (WARN_ON(!count && 3621 !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) 3622 return -ENOENT; 3623 if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) 3624 return -EINVAL; 3625 } 3626 3627 for (i = 0; i < nr; i++) { 3628 count = si->swap_map[offset + i]; 3629 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3630 count += usage; 3631 else if (swap_count_continued(si, offset + i, count)) 3632 count = COUNT_CONTINUED; 3633 else { 3634 /* 3635 * No need to roll back changes, because if 3636 * usage == 1, nr must be 1.
3637 */ 3638 return -ENOMEM; 3639 } 3640 3641 WRITE_ONCE(si->swap_map[offset + i], count); 3642 } 3643 3644 return 0; 3645 } 3646 3647 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) 3648 { 3649 int err; 3650 struct swap_info_struct *si; 3651 struct swap_cluster_info *ci; 3652 unsigned long offset = swp_offset(entry); 3653 3654 si = swap_entry_to_info(entry); 3655 if (WARN_ON_ONCE(!si)) { 3656 pr_err("%s%08lx\n", Bad_file, entry.val); 3657 return -EINVAL; 3658 } 3659 3660 VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); 3661 ci = swap_cluster_lock(si, offset); 3662 err = swap_dup_entries(si, ci, offset, usage, nr); 3663 swap_cluster_unlock(ci); 3664 return err; 3665 } 3666 3667 /* 3668 * swap_dup_entry_direct() - Increase reference count of a swap entry by one. 3669 * @entry: first swap entry from which we want to increase the refcount. 3670 * 3671 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3672 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3673 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3674 * might occur if a page table entry has got corrupted. 3675 * 3676 * Context: Caller must ensure there is no race condition on the reference 3677 * owner. e.g., locking the PTL of a PTE containing the entry being increased. 3678 */ 3679 int swap_dup_entry_direct(swp_entry_t entry) 3680 { 3681 int err = 0; 3682 while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) 3683 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3684 return err; 3685 } 3686 3687 /* 3688 * add_swap_count_continuation - called when a swap count is duplicated 3689 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3690 * page of the original vmalloc'ed swap_map, to hold the continuation count 3691 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3692 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3693 * 3694 * These continuation pages are seldom referenced: the common paths all work 3695 * on the original swap_map, only referring to a continuation page when the 3696 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3697 * 3698 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3699 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3700 * can be called after dropping locks. 3701 */ 3702 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3703 { 3704 struct swap_info_struct *si; 3705 struct swap_cluster_info *ci; 3706 struct page *head; 3707 struct page *page; 3708 struct page *list_page; 3709 pgoff_t offset; 3710 unsigned char count; 3711 int ret = 0; 3712 3713 /* 3714 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3715 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3716 */ 3717 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3718 3719 si = get_swap_device(entry); 3720 if (!si) { 3721 /* 3722 * An acceptable race has occurred since the failing 3723 * __swap_duplicate(): the swap device may be swapoff 3724 */ 3725 goto outer; 3726 } 3727 3728 offset = swp_offset(entry); 3729 3730 ci = swap_cluster_lock(si, offset); 3731 3732 count = si->swap_map[offset]; 3733 3734 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3735 /* 3736 * The higher the swap count, the more likely it is that tasks 3737 * will race to add swap count continuation: we need to avoid 3738 * over-provisioning. 
3739 */ 3740 goto out; 3741 } 3742 3743 if (!page) { 3744 ret = -ENOMEM; 3745 goto out; 3746 } 3747 3748 head = vmalloc_to_page(si->swap_map + offset); 3749 offset &= ~PAGE_MASK; 3750 3751 spin_lock(&si->cont_lock); 3752 /* 3753 * Page allocation does not initialize the page's lru field, 3754 * but it does always reset its private field. 3755 */ 3756 if (!page_private(head)) { 3757 BUG_ON(count & COUNT_CONTINUED); 3758 INIT_LIST_HEAD(&head->lru); 3759 set_page_private(head, SWP_CONTINUED); 3760 si->flags |= SWP_CONTINUED; 3761 } 3762 3763 list_for_each_entry(list_page, &head->lru, lru) { 3764 unsigned char *map; 3765 3766 /* 3767 * If the previous map said no continuation, but we've found 3768 * a continuation page, free our allocation and use this one. 3769 */ 3770 if (!(count & COUNT_CONTINUED)) 3771 goto out_unlock_cont; 3772 3773 map = kmap_local_page(list_page) + offset; 3774 count = *map; 3775 kunmap_local(map); 3776 3777 /* 3778 * If this continuation count now has some space in it, 3779 * free our allocation and use this one. 3780 */ 3781 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3782 goto out_unlock_cont; 3783 } 3784 3785 list_add_tail(&page->lru, &head->lru); 3786 page = NULL; /* now it's attached, don't free it */ 3787 out_unlock_cont: 3788 spin_unlock(&si->cont_lock); 3789 out: 3790 swap_cluster_unlock(ci); 3791 put_swap_device(si); 3792 outer: 3793 if (page) 3794 __free_page(page); 3795 return ret; 3796 } 3797 3798 /* 3799 * swap_count_continued - when the original swap_map count is incremented 3800 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3801 * into, carry if so, or else fail until a new continuation page is allocated; 3802 * when the original swap_map count is decremented from 0 with continuation, 3803 * borrow from the continuation and report whether it still holds more. 3804 * Called while __swap_duplicate() or caller of swap_put_entry_locked() 3805 * holds cluster lock. 
3806 */ 3807 static bool swap_count_continued(struct swap_info_struct *si, 3808 pgoff_t offset, unsigned char count) 3809 { 3810 struct page *head; 3811 struct page *page; 3812 unsigned char *map; 3813 bool ret; 3814 3815 head = vmalloc_to_page(si->swap_map + offset); 3816 if (page_private(head) != SWP_CONTINUED) { 3817 BUG_ON(count & COUNT_CONTINUED); 3818 return false; /* need to add count continuation */ 3819 } 3820 3821 spin_lock(&si->cont_lock); 3822 offset &= ~PAGE_MASK; 3823 page = list_next_entry(head, lru); 3824 map = kmap_local_page(page) + offset; 3825 3826 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3827 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3828 3829 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3830 /* 3831 * Think of how you add 1 to 999 3832 */ 3833 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3834 kunmap_local(map); 3835 page = list_next_entry(page, lru); 3836 BUG_ON(page == head); 3837 map = kmap_local_page(page) + offset; 3838 } 3839 if (*map == SWAP_CONT_MAX) { 3840 kunmap_local(map); 3841 page = list_next_entry(page, lru); 3842 if (page == head) { 3843 ret = false; /* add count continuation */ 3844 goto out; 3845 } 3846 map = kmap_local_page(page) + offset; 3847 init_map: *map = 0; /* we didn't zero the page */ 3848 } 3849 *map += 1; 3850 kunmap_local(map); 3851 while ((page = list_prev_entry(page, lru)) != head) { 3852 map = kmap_local_page(page) + offset; 3853 *map = COUNT_CONTINUED; 3854 kunmap_local(map); 3855 } 3856 ret = true; /* incremented */ 3857 3858 } else { /* decrementing */ 3859 /* 3860 * Think of how you subtract 1 from 1000 3861 */ 3862 BUG_ON(count != COUNT_CONTINUED); 3863 while (*map == COUNT_CONTINUED) { 3864 kunmap_local(map); 3865 page = list_next_entry(page, lru); 3866 BUG_ON(page == head); 3867 map = kmap_local_page(page) + offset; 3868 } 3869 BUG_ON(*map == 0); 3870 *map -= 1; 3871 if (*map == 0) 3872 count = 0; 3873 kunmap_local(map); 3874 while ((page = list_prev_entry(page, lru)) != head) { 3875 map = kmap_local_page(page) + offset; 3876 *map = SWAP_CONT_MAX | count; 3877 count = COUNT_CONTINUED; 3878 kunmap_local(map); 3879 } 3880 ret = count == COUNT_CONTINUED; 3881 } 3882 out: 3883 spin_unlock(&si->cont_lock); 3884 return ret; 3885 } 3886 3887 /* 3888 * free_swap_count_continuations - swapoff free all the continuation pages 3889 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3890 */ 3891 static void free_swap_count_continuations(struct swap_info_struct *si) 3892 { 3893 pgoff_t offset; 3894 3895 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3896 struct page *head; 3897 head = vmalloc_to_page(si->swap_map + offset); 3898 if (page_private(head)) { 3899 struct page *page, *next; 3900 3901 list_for_each_entry_safe(page, next, &head->lru, lru) { 3902 list_del(&page->lru); 3903 __free_page(page); 3904 } 3905 } 3906 } 3907 } 3908 3909 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 3910 static bool __has_usable_swap(void) 3911 { 3912 return !plist_head_empty(&swap_active_head); 3913 } 3914 3915 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) 3916 { 3917 struct swap_info_struct *si; 3918 3919 if (!(gfp & __GFP_IO)) 3920 return; 3921 3922 if (!__has_usable_swap()) 3923 return; 3924 3925 if (!blk_cgroup_congested()) 3926 return; 3927 3928 /* 3929 * We've already scheduled a throttle, avoid taking the global swap 3930 * lock. 
3931 */ 3932 if (current->throttle_disk) 3933 return; 3934 3935 spin_lock(&swap_avail_lock); 3936 plist_for_each_entry(si, &swap_avail_head, avail_list) { 3937 if (si->bdev) { 3938 blkcg_schedule_throttle(si->bdev->bd_disk, true); 3939 break; 3940 } 3941 } 3942 spin_unlock(&swap_avail_lock); 3943 } 3944 #endif 3945 3946 static int __init swapfile_init(void) 3947 { 3948 swapfile_maximum_size = arch_max_swapfile_size(); 3949 3950 /* 3951 * Once a cluster is freed, its swap table content is read-only, 3952 * and all swap cache readers (swap_cache_*) verify the content 3953 * before use. So it's safe to use a SLAB_TYPESAFE_BY_RCU cache here. 3954 */ 3955 if (!SWP_TABLE_USE_PAGE) 3956 swap_table_cachep = kmem_cache_create("swap_table", 3957 sizeof(struct swap_table), 3958 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); 3959 3960 #ifdef CONFIG_MIGRATION 3961 if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) 3962 swap_migration_ad_supported = true; 3963 #endif /* CONFIG_MIGRATION */ 3964 3965 return 0; 3966 } 3967 subsys_initcall(swapfile_init); 3968
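/*
 * Reader's note on the swap count continuation scheme implemented by
 * add_swap_count_continuation(), swap_count_continued() and swp_swapcount():
 * an illustrative sketch, not relied upon by any code in this file. Once a
 * count grows past SWAP_MAP_MAX, the swap_map byte keeps only the lowest
 * "digit" (plus COUNT_CONTINUED), and each continuation page holds one
 * further digit at the same in-page offset, each digit running up to
 * SWAP_CONT_MAX. swp_swapcount() reassembles the total the way you read a
 * multi-digit number:
 *
 *	total  = swap_map[offset] & ~COUNT_CONTINUED;
 *	weight = SWAP_MAP_MAX + 1;
 *	for each continuation page, in list order:
 *		digit   = that page's byte at the same in-page offset;
 *		total  += (digit & ~COUNT_CONTINUED) * weight;
 *		weight *= SWAP_CONT_MAX + 1;
 *		stop once (digit & COUNT_CONTINUED) is clear;
 *
 * swap_count_continued() performs the matching carry ("add 1 to 999") when
 * incrementing through SWAP_MAP_MAX and the matching borrow ("subtract 1
 * from 1000") when decrementing back through it.
 */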