1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/swapfile.c 4 * 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 6 * Swap reorganised 29.12.95, Stephen Tweedie 7 */ 8 9 #include <linux/blkdev.h> 10 #include <linux/mm.h> 11 #include <linux/sched/mm.h> 12 #include <linux/sched/task.h> 13 #include <linux/hugetlb.h> 14 #include <linux/mman.h> 15 #include <linux/slab.h> 16 #include <linux/kernel_stat.h> 17 #include <linux/swap.h> 18 #include <linux/vmalloc.h> 19 #include <linux/pagemap.h> 20 #include <linux/namei.h> 21 #include <linux/shmem_fs.h> 22 #include <linux/blk-cgroup.h> 23 #include <linux/random.h> 24 #include <linux/writeback.h> 25 #include <linux/proc_fs.h> 26 #include <linux/seq_file.h> 27 #include <linux/init.h> 28 #include <linux/ksm.h> 29 #include <linux/rmap.h> 30 #include <linux/security.h> 31 #include <linux/backing-dev.h> 32 #include <linux/mutex.h> 33 #include <linux/capability.h> 34 #include <linux/syscalls.h> 35 #include <linux/memcontrol.h> 36 #include <linux/poll.h> 37 #include <linux/oom.h> 38 #include <linux/swapfile.h> 39 #include <linux/export.h> 40 #include <linux/swap_slots.h> 41 #include <linux/sort.h> 42 #include <linux/completion.h> 43 #include <linux/suspend.h> 44 #include <linux/zswap.h> 45 #include <linux/plist.h> 46 47 #include <asm/tlbflush.h> 48 #include <linux/swapops.h> 49 #include <linux/swap_cgroup.h> 50 #include "internal.h" 51 #include "swap.h" 52 53 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 unsigned char); 55 static void free_swap_count_continuations(struct swap_info_struct *); 56 static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, 57 unsigned int nr_pages); 58 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 59 unsigned int nr_entries); 60 static bool folio_swapcache_freeable(struct folio *folio); 61 static struct swap_cluster_info *lock_cluster_or_swap_info( 62 struct swap_info_struct *si, unsigned long offset); 63 static void unlock_cluster_or_swap_info(struct swap_info_struct *si, 64 struct swap_cluster_info *ci); 65 66 static DEFINE_SPINLOCK(swap_lock); 67 static unsigned int nr_swapfiles; 68 atomic_long_t nr_swap_pages; 69 /* 70 * Some modules use swappable objects and may try to swap them out under 71 * memory pressure (via the shrinker). Before doing so, they may wish to 72 * check to see if any swap space is available. 73 */ 74 EXPORT_SYMBOL_GPL(nr_swap_pages); 75 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 76 long total_swap_pages; 77 static int least_priority = -1; 78 unsigned long swapfile_maximum_size; 79 #ifdef CONFIG_MIGRATION 80 bool swap_migration_ad_supported; 81 #endif /* CONFIG_MIGRATION */ 82 83 static const char Bad_file[] = "Bad swap file entry "; 84 static const char Unused_file[] = "Unused swap file entry "; 85 static const char Bad_offset[] = "Bad swap offset entry "; 86 static const char Unused_offset[] = "Unused swap offset entry "; 87 88 /* 89 * all active swap_info_structs 90 * protected with swap_lock, and ordered by priority. 91 */ 92 static PLIST_HEAD(swap_active_head); 93 94 /* 95 * all available (active, not full) swap_info_structs 96 * protected with swap_avail_lock, ordered by priority. 97 * This is used by folio_alloc_swap() instead of swap_active_head 98 * because swap_active_head includes all swap_info_structs, 99 * but folio_alloc_swap() doesn't need to look at full ones. 
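 * There is one swap_avail_heads list per NUMA node, so that each node
 * can prefer swap devices that are local to it.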
100 * This uses its own lock instead of swap_lock because when a 101 * swap_info_struct changes between not-full/full, it needs to 102 * add/remove itself to/from this list, but the swap_info_struct->lock 103 * is held and the locking order requires swap_lock to be taken 104 * before any swap_info_struct->lock. 105 */ 106 static struct plist_head *swap_avail_heads; 107 static DEFINE_SPINLOCK(swap_avail_lock); 108 109 static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 110 111 static DEFINE_MUTEX(swapon_mutex); 112 113 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); 114 /* Activity counter to indicate that a swapon or swapoff has occurred */ 115 static atomic_t proc_poll_event = ATOMIC_INIT(0); 116 117 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 118 119 static struct swap_info_struct *swap_type_to_swap_info(int type) 120 { 121 if (type >= MAX_SWAPFILES) 122 return NULL; 123 124 return READ_ONCE(swap_info[type]); /* rcu_dereference() */ 125 } 126 127 static inline unsigned char swap_count(unsigned char ent) 128 { 129 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ 130 } 131 132 /* Reclaim the swap entry anyway if possible */ 133 #define TTRS_ANYWAY 0x1 134 /* 135 * Reclaim the swap entry if there are no more mappings of the 136 * corresponding page 137 */ 138 #define TTRS_UNMAPPED 0x2 139 /* Reclaim the swap entry if swap is getting full */ 140 #define TTRS_FULL 0x4 141 /* Reclaim directly, bypass the slot cache and don't touch device lock */ 142 #define TTRS_DIRECT 0x8 143 144 static bool swap_is_has_cache(struct swap_info_struct *si, 145 unsigned long offset, int nr_pages) 146 { 147 unsigned char *map = si->swap_map + offset; 148 unsigned char *map_end = map + nr_pages; 149 150 do { 151 VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); 152 if (*map != SWAP_HAS_CACHE) 153 return false; 154 } while (++map < map_end); 155 156 return true; 157 } 158 159 static bool swap_is_last_map(struct swap_info_struct *si, 160 unsigned long offset, int nr_pages, bool *has_cache) 161 { 162 unsigned char *map = si->swap_map + offset; 163 unsigned char *map_end = map + nr_pages; 164 unsigned char count = *map; 165 166 if (swap_count(count) != 1) 167 return false; 168 169 while (++map < map_end) { 170 if (*map != count) 171 return false; 172 } 173 174 *has_cache = !!(count & SWAP_HAS_CACHE); 175 return true; 176 } 177 178 /* 179 * returns number of pages in the folio that backs the swap entry. If positive, 180 * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no 181 * folio was associated with the swap entry. 182 */ 183 static int __try_to_reclaim_swap(struct swap_info_struct *si, 184 unsigned long offset, unsigned long flags) 185 { 186 swp_entry_t entry = swp_entry(si->type, offset); 187 struct address_space *address_space = swap_address_space(entry); 188 struct swap_cluster_info *ci; 189 struct folio *folio; 190 int ret, nr_pages; 191 bool need_reclaim; 192 193 folio = filemap_get_folio(address_space, swap_cache_index(entry)); 194 if (IS_ERR(folio)) 195 return 0; 196 197 nr_pages = folio_nr_pages(folio); 198 ret = -nr_pages; 199 200 /* 201 * When this function is called from scan_swap_map_slots() and it's 202 * called by vmscan.c at reclaiming folios. So we hold a folio lock 203 * here. We have to use trylock for avoiding deadlock. This is a special 204 * case and you should use folio_free_swap() with explicit folio_lock() 205 * in usual operations. 
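	 * (Callers such as free_swap_and_cache_nr() pass TTRS_UNMAPPED |
	 * TTRS_FULL, while the cluster allocator uses TTRS_ANYWAY | TTRS_DIRECT
	 * to reclaim without going through the slot cache.)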
	 */
	if (!folio_trylock(folio))
		goto out;

	/* offset could point to the middle of a large folio */
	entry = folio->swap;
	offset = swp_offset(entry);

	need_reclaim = ((flags & TTRS_ANYWAY) ||
			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
			((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
	if (!need_reclaim || !folio_swapcache_freeable(folio))
		goto out_unlock;

	/*
	 * It's safe to delete the folio from the swap cache only if the folio's
	 * swap_map is HAS_CACHE only, which means the slots have no page table
	 * reference or pending writeback, and can't be allocated to others.
	 */
	ci = lock_cluster_or_swap_info(si, offset);
	need_reclaim = swap_is_has_cache(si, offset, nr_pages);
	unlock_cluster_or_swap_info(si, ci);
	if (!need_reclaim)
		goto out_unlock;

	if (!(flags & TTRS_DIRECT)) {
		/* Free through the slot cache */
		delete_from_swap_cache(folio);
		folio_set_dirty(folio);
		ret = nr_pages;
		goto out_unlock;
	}

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);
	folio_ref_sub(folio, nr_pages);
	folio_set_dirty(folio);

	spin_lock(&si->lock);
	/* Only single-page folios can be backed by zswap */
	if (nr_pages == 1)
		zswap_invalidate(entry);
	swap_entry_range_free(si, entry, nr_pages);
	spin_unlock(&si->lock);
	ret = nr_pages;
out_unlock:
	folio_unlock(folio);
out:
	folio_put(folio);
	return ret;
}

static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page!
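	 * The first page of the swap area holds the swap header, hence the
	 * "+ 1" on start_block and "- 1" on nr_pages for the first extent.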
*/ 283 se = first_se(si); 284 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); 285 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 286 if (nr_blocks) { 287 err = blkdev_issue_discard(si->bdev, start_block, 288 nr_blocks, GFP_KERNEL); 289 if (err) 290 return err; 291 cond_resched(); 292 } 293 294 for (se = next_se(se); se; se = next_se(se)) { 295 start_block = se->start_block << (PAGE_SHIFT - 9); 296 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 297 298 err = blkdev_issue_discard(si->bdev, start_block, 299 nr_blocks, GFP_KERNEL); 300 if (err) 301 break; 302 303 cond_resched(); 304 } 305 return err; /* That will often be -EOPNOTSUPP */ 306 } 307 308 static struct swap_extent * 309 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) 310 { 311 struct swap_extent *se; 312 struct rb_node *rb; 313 314 rb = sis->swap_extent_root.rb_node; 315 while (rb) { 316 se = rb_entry(rb, struct swap_extent, rb_node); 317 if (offset < se->start_page) 318 rb = rb->rb_left; 319 else if (offset >= se->start_page + se->nr_pages) 320 rb = rb->rb_right; 321 else 322 return se; 323 } 324 /* It *must* be present */ 325 BUG(); 326 } 327 328 sector_t swap_folio_sector(struct folio *folio) 329 { 330 struct swap_info_struct *sis = swp_swap_info(folio->swap); 331 struct swap_extent *se; 332 sector_t sector; 333 pgoff_t offset; 334 335 offset = swp_offset(folio->swap); 336 se = offset_to_swap_extent(sis, offset); 337 sector = se->start_block + (offset - se->start_page); 338 return sector << (PAGE_SHIFT - 9); 339 } 340 341 /* 342 * swap allocation tell device that a cluster of swap can now be discarded, 343 * to allow the swap device to optimize its wear-levelling. 344 */ 345 static void discard_swap_cluster(struct swap_info_struct *si, 346 pgoff_t start_page, pgoff_t nr_pages) 347 { 348 struct swap_extent *se = offset_to_swap_extent(si, start_page); 349 350 while (nr_pages) { 351 pgoff_t offset = start_page - se->start_page; 352 sector_t start_block = se->start_block + offset; 353 sector_t nr_blocks = se->nr_pages - offset; 354 355 if (nr_blocks > nr_pages) 356 nr_blocks = nr_pages; 357 start_page += nr_blocks; 358 nr_pages -= nr_blocks; 359 360 start_block <<= PAGE_SHIFT - 9; 361 nr_blocks <<= PAGE_SHIFT - 9; 362 if (blkdev_issue_discard(si->bdev, start_block, 363 nr_blocks, GFP_NOIO)) 364 break; 365 366 se = next_se(se); 367 } 368 } 369 370 #ifdef CONFIG_THP_SWAP 371 #define SWAPFILE_CLUSTER HPAGE_PMD_NR 372 373 #define swap_entry_order(order) (order) 374 #else 375 #define SWAPFILE_CLUSTER 256 376 377 /* 378 * Define swap_entry_order() as constant to let compiler to optimize 379 * out some code if !CONFIG_THP_SWAP 380 */ 381 #define swap_entry_order(order) 0 382 #endif 383 #define LATENCY_LIMIT 256 384 385 static inline bool cluster_is_free(struct swap_cluster_info *info) 386 { 387 return info->flags & CLUSTER_FLAG_FREE; 388 } 389 390 static inline unsigned int cluster_index(struct swap_info_struct *si, 391 struct swap_cluster_info *ci) 392 { 393 return ci - si->cluster_info; 394 } 395 396 static inline unsigned int cluster_offset(struct swap_info_struct *si, 397 struct swap_cluster_info *ci) 398 { 399 return cluster_index(si, ci) * SWAPFILE_CLUSTER; 400 } 401 402 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 403 unsigned long offset) 404 { 405 struct swap_cluster_info *ci; 406 407 ci = si->cluster_info; 408 if (ci) { 409 ci += offset / SWAPFILE_CLUSTER; 410 spin_lock(&ci->lock); 411 } 412 return ci; 413 } 414 415 static inline void 
unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
		struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}

/* Add a cluster to the discard list and schedule the discard work */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		struct swap_cluster_info *ci)
{
	unsigned int idx = cluster_index(si, ci);
	/*
	 * If scan_swap_map_slots() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
	 * They will be cleared after the discard finishes.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
	list_move_tail(&ci->list, &si->discard_clusters);
	ci->flags = 0;
	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
	lockdep_assert_held(&si->lock);
	lockdep_assert_held(&ci->lock);

	if (ci->flags)
		list_move_tail(&ci->list, &si->free_clusters);
	else
		list_add_tail(&ci->list, &si->free_clusters);
	ci->flags = CLUSTER_FLAG_FREE;
	ci->order = 0;
}

/*
 * Actually do the scheduled cluster discards.  After a cluster has been
 * discarded it is added back to the free cluster list.  The caller should
 * hold si->lock.
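 * si->lock is dropped around the actual discard, which may sleep; the
 * affected slots stay marked SWAP_MAP_BAD until the discard completes and
 * they are cleared again below.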
484 */ 485 static void swap_do_scheduled_discard(struct swap_info_struct *si) 486 { 487 struct swap_cluster_info *ci; 488 unsigned int idx; 489 490 while (!list_empty(&si->discard_clusters)) { 491 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); 492 list_del(&ci->list); 493 idx = cluster_index(si, ci); 494 spin_unlock(&si->lock); 495 496 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 497 SWAPFILE_CLUSTER); 498 499 spin_lock(&si->lock); 500 spin_lock(&ci->lock); 501 __free_cluster(si, ci); 502 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 503 0, SWAPFILE_CLUSTER); 504 spin_unlock(&ci->lock); 505 } 506 } 507 508 static void swap_discard_work(struct work_struct *work) 509 { 510 struct swap_info_struct *si; 511 512 si = container_of(work, struct swap_info_struct, discard_work); 513 514 spin_lock(&si->lock); 515 swap_do_scheduled_discard(si); 516 spin_unlock(&si->lock); 517 } 518 519 static void swap_users_ref_free(struct percpu_ref *ref) 520 { 521 struct swap_info_struct *si; 522 523 si = container_of(ref, struct swap_info_struct, users); 524 complete(&si->comp); 525 } 526 527 static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) 528 { 529 VM_BUG_ON(ci->count != 0); 530 lockdep_assert_held(&si->lock); 531 lockdep_assert_held(&ci->lock); 532 533 if (ci->flags & CLUSTER_FLAG_FRAG) 534 si->frag_cluster_nr[ci->order]--; 535 536 /* 537 * If the swap is discardable, prepare discard the cluster 538 * instead of free it immediately. The cluster will be freed 539 * after discard. 540 */ 541 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == 542 (SWP_WRITEOK | SWP_PAGE_DISCARD)) { 543 swap_cluster_schedule_discard(si, ci); 544 return; 545 } 546 547 __free_cluster(si, ci); 548 } 549 550 /* 551 * The cluster corresponding to page_nr will be used. The cluster will not be 552 * added to free cluster list and its usage counter will be increased by 1. 553 * Only used for initialization. 554 */ 555 static void inc_cluster_info_page(struct swap_info_struct *si, 556 struct swap_cluster_info *cluster_info, unsigned long page_nr) 557 { 558 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 559 struct swap_cluster_info *ci; 560 561 if (!cluster_info) 562 return; 563 564 ci = cluster_info + idx; 565 ci->count++; 566 567 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); 568 VM_BUG_ON(ci->flags); 569 } 570 571 /* 572 * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, 573 * which means no page in the cluster is in use, we can optionally discard 574 * the cluster and add it to free cluster list. 
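 * If the cluster still has entries in use and is not already on the nonfull
 * list, it is moved onto the nonfull list for its order so the allocator can
 * pick it up again.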
575 */ 576 static void dec_cluster_info_page(struct swap_info_struct *si, 577 struct swap_cluster_info *ci, int nr_pages) 578 { 579 if (!si->cluster_info) 580 return; 581 582 VM_BUG_ON(ci->count < nr_pages); 583 VM_BUG_ON(cluster_is_free(ci)); 584 lockdep_assert_held(&si->lock); 585 lockdep_assert_held(&ci->lock); 586 ci->count -= nr_pages; 587 588 if (!ci->count) { 589 free_cluster(si, ci); 590 return; 591 } 592 593 if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { 594 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); 595 if (ci->flags & CLUSTER_FLAG_FRAG) 596 si->frag_cluster_nr[ci->order]--; 597 list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); 598 ci->flags = CLUSTER_FLAG_NONFULL; 599 } 600 } 601 602 static bool cluster_reclaim_range(struct swap_info_struct *si, 603 struct swap_cluster_info *ci, 604 unsigned long start, unsigned long end) 605 { 606 unsigned char *map = si->swap_map; 607 unsigned long offset; 608 609 spin_unlock(&ci->lock); 610 spin_unlock(&si->lock); 611 612 for (offset = start; offset < end; offset++) { 613 switch (READ_ONCE(map[offset])) { 614 case 0: 615 continue; 616 case SWAP_HAS_CACHE: 617 if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) 618 continue; 619 goto out; 620 default: 621 goto out; 622 } 623 } 624 out: 625 spin_lock(&si->lock); 626 spin_lock(&ci->lock); 627 628 /* 629 * Recheck the range no matter reclaim succeeded or not, the slot 630 * could have been be freed while we are not holding the lock. 631 */ 632 for (offset = start; offset < end; offset++) 633 if (READ_ONCE(map[offset])) 634 return false; 635 636 return true; 637 } 638 639 static bool cluster_scan_range(struct swap_info_struct *si, 640 struct swap_cluster_info *ci, 641 unsigned long start, unsigned int nr_pages) 642 { 643 unsigned long offset, end = start + nr_pages; 644 unsigned char *map = si->swap_map; 645 bool need_reclaim = false; 646 647 for (offset = start; offset < end; offset++) { 648 switch (READ_ONCE(map[offset])) { 649 case 0: 650 continue; 651 case SWAP_HAS_CACHE: 652 if (!vm_swap_full()) 653 return false; 654 need_reclaim = true; 655 continue; 656 default: 657 return false; 658 } 659 } 660 661 if (need_reclaim) 662 return cluster_reclaim_range(si, ci, start, end); 663 664 return true; 665 } 666 667 static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, 668 unsigned int start, unsigned char usage, 669 unsigned int order) 670 { 671 unsigned int nr_pages = 1 << order; 672 673 if (!(si->flags & SWP_WRITEOK)) 674 return false; 675 676 if (cluster_is_free(ci)) { 677 if (nr_pages < SWAPFILE_CLUSTER) { 678 list_move_tail(&ci->list, &si->nonfull_clusters[order]); 679 ci->flags = CLUSTER_FLAG_NONFULL; 680 } 681 ci->order = order; 682 } 683 684 memset(si->swap_map + start, usage, nr_pages); 685 swap_range_alloc(si, start, nr_pages); 686 ci->count += nr_pages; 687 688 if (ci->count == SWAPFILE_CLUSTER) { 689 VM_BUG_ON(!(ci->flags & 690 (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); 691 if (ci->flags & CLUSTER_FLAG_FRAG) 692 si->frag_cluster_nr[ci->order]--; 693 list_move_tail(&ci->list, &si->full_clusters); 694 ci->flags = CLUSTER_FLAG_FULL; 695 } 696 697 return true; 698 } 699 700 static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, 701 unsigned int *foundp, unsigned int order, 702 unsigned char usage) 703 { 704 unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); 705 unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); 706 unsigned int nr_pages = 1 << order; 707 
struct swap_cluster_info *ci; 708 709 if (end < nr_pages) 710 return SWAP_NEXT_INVALID; 711 end -= nr_pages; 712 713 ci = lock_cluster(si, offset); 714 if (ci->count + nr_pages > SWAPFILE_CLUSTER) { 715 offset = SWAP_NEXT_INVALID; 716 goto done; 717 } 718 719 while (offset <= end) { 720 if (cluster_scan_range(si, ci, offset, nr_pages)) { 721 if (!cluster_alloc_range(si, ci, offset, usage, order)) { 722 offset = SWAP_NEXT_INVALID; 723 goto done; 724 } 725 *foundp = offset; 726 if (ci->count == SWAPFILE_CLUSTER) { 727 offset = SWAP_NEXT_INVALID; 728 goto done; 729 } 730 offset += nr_pages; 731 break; 732 } 733 offset += nr_pages; 734 } 735 if (offset > end) 736 offset = SWAP_NEXT_INVALID; 737 done: 738 unlock_cluster(ci); 739 return offset; 740 } 741 742 /* Return true if reclaimed a whole cluster */ 743 static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) 744 { 745 long to_scan = 1; 746 unsigned long offset, end; 747 struct swap_cluster_info *ci; 748 unsigned char *map = si->swap_map; 749 int nr_reclaim; 750 751 if (force) 752 to_scan = si->inuse_pages / SWAPFILE_CLUSTER; 753 754 while (!list_empty(&si->full_clusters)) { 755 ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); 756 list_move_tail(&ci->list, &si->full_clusters); 757 offset = cluster_offset(si, ci); 758 end = min(si->max, offset + SWAPFILE_CLUSTER); 759 to_scan--; 760 761 spin_unlock(&si->lock); 762 while (offset < end) { 763 if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { 764 nr_reclaim = __try_to_reclaim_swap(si, offset, 765 TTRS_ANYWAY | TTRS_DIRECT); 766 if (nr_reclaim) { 767 offset += abs(nr_reclaim); 768 continue; 769 } 770 } 771 offset++; 772 } 773 spin_lock(&si->lock); 774 775 if (to_scan <= 0) 776 break; 777 } 778 } 779 780 static void swap_reclaim_work(struct work_struct *work) 781 { 782 struct swap_info_struct *si; 783 784 si = container_of(work, struct swap_info_struct, reclaim_work); 785 786 spin_lock(&si->lock); 787 swap_reclaim_full_clusters(si, true); 788 spin_unlock(&si->lock); 789 } 790 791 /* 792 * Try to get swap entries with specified order from current cpu's swap entry 793 * pool (a cluster). This might involve allocating a new cluster for current CPU 794 * too. 795 */ 796 static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, 797 unsigned char usage) 798 { 799 struct percpu_cluster *cluster; 800 struct swap_cluster_info *ci; 801 unsigned int offset, found = 0; 802 803 new_cluster: 804 lockdep_assert_held(&si->lock); 805 cluster = this_cpu_ptr(si->percpu_cluster); 806 offset = cluster->next[order]; 807 if (offset) { 808 offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); 809 if (found) 810 goto done; 811 } 812 813 if (!list_empty(&si->free_clusters)) { 814 ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); 815 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); 816 /* 817 * Either we didn't touch the cluster due to swapoff, 818 * or the allocation must success. 
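		 * (A cluster on the free list has a full run of empty slots,
		 * so the scan is only expected to fail if a concurrent swapoff
		 * cleared SWP_WRITEOK.)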
		 */
		VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
		goto done;
	}

	/* Try to reclaim from full clusters if the free cluster list is drained */
	if (vm_swap_full())
		swap_reclaim_full_clusters(si, false);

	if (order < PMD_ORDER) {
		unsigned int frags = 0;

		while (!list_empty(&si->nonfull_clusters[order])) {
			ci = list_first_entry(&si->nonfull_clusters[order],
					      struct swap_cluster_info, list);
			list_move_tail(&ci->list, &si->frag_clusters[order]);
			ci->flags = CLUSTER_FLAG_FRAG;
			si->frag_cluster_nr[order]++;
			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
							 &found, order, usage);
			frags++;
			if (found)
				break;
		}

		if (!found) {
			/*
			 * Nonfull clusters reaching this point were moved to the
			 * tail of the frag list above, so count them too and
			 * don't over-scan the frag list.
			 */
			while (frags < si->frag_cluster_nr[order]) {
				ci = list_first_entry(&si->frag_clusters[order],
						      struct swap_cluster_info, list);
				/*
				 * Rotate the frag list while iterating: these
				 * clusters either failed a high-order allocation
				 * or were moved here due to per-CPU usage, so
				 * rotating helps keep usable clusters at the front.
				 */
				list_move_tail(&ci->list, &si->frag_clusters[order]);
				offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
								 &found, order, usage);
				frags++;
				if (found)
					break;
			}
		}
	}

	if (found)
		goto done;

	if (!list_empty(&si->discard_clusters)) {
		/*
		 * We have no free clusters, but some clusters are being
		 * discarded: do the discard now and reclaim them, then go back
		 * and reread the per-CPU next offset since si->lock was dropped.
		 */
		swap_do_scheduled_discard(si);
		goto new_cluster;
	}

	if (order)
		goto done;

	/* Order 0 stealing from higher order */
	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
		/*
		 * Clusters here have at least one usable slot and can't fail an
		 * order-0 allocation, but reclaim may drop si->lock and race
		 * with another user.
888 */ 889 while (!list_empty(&si->frag_clusters[o])) { 890 ci = list_first_entry(&si->frag_clusters[o], 891 struct swap_cluster_info, list); 892 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 893 &found, 0, usage); 894 if (found) 895 goto done; 896 } 897 898 while (!list_empty(&si->nonfull_clusters[o])) { 899 ci = list_first_entry(&si->nonfull_clusters[o], 900 struct swap_cluster_info, list); 901 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 902 &found, 0, usage); 903 if (found) 904 goto done; 905 } 906 } 907 908 done: 909 cluster->next[order] = offset; 910 return found; 911 } 912 913 static void __del_from_avail_list(struct swap_info_struct *si) 914 { 915 int nid; 916 917 assert_spin_locked(&si->lock); 918 for_each_node(nid) 919 plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); 920 } 921 922 static void del_from_avail_list(struct swap_info_struct *si) 923 { 924 spin_lock(&swap_avail_lock); 925 __del_from_avail_list(si); 926 spin_unlock(&swap_avail_lock); 927 } 928 929 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 930 unsigned int nr_entries) 931 { 932 unsigned int end = offset + nr_entries - 1; 933 934 if (offset == si->lowest_bit) 935 si->lowest_bit += nr_entries; 936 if (end == si->highest_bit) 937 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); 938 WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); 939 if (si->inuse_pages == si->pages) { 940 si->lowest_bit = si->max; 941 si->highest_bit = 0; 942 del_from_avail_list(si); 943 944 if (si->cluster_info && vm_swap_full()) 945 schedule_work(&si->reclaim_work); 946 } 947 } 948 949 static void add_to_avail_list(struct swap_info_struct *si) 950 { 951 int nid; 952 953 spin_lock(&swap_avail_lock); 954 for_each_node(nid) 955 plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); 956 spin_unlock(&swap_avail_lock); 957 } 958 959 static void swap_range_free(struct swap_info_struct *si, unsigned long offset, 960 unsigned int nr_entries) 961 { 962 unsigned long begin = offset; 963 unsigned long end = offset + nr_entries - 1; 964 void (*swap_slot_free_notify)(struct block_device *, unsigned long); 965 unsigned int i; 966 967 /* 968 * Use atomic clear_bit operations only on zeromap instead of non-atomic 969 * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 970 */ 971 for (i = 0; i < nr_entries; i++) 972 clear_bit(offset + i, si->zeromap); 973 974 if (offset < si->lowest_bit) 975 si->lowest_bit = offset; 976 if (end > si->highest_bit) { 977 bool was_full = !si->highest_bit; 978 979 WRITE_ONCE(si->highest_bit, end); 980 if (was_full && (si->flags & SWP_WRITEOK)) 981 add_to_avail_list(si); 982 } 983 if (si->flags & SWP_BLKDEV) 984 swap_slot_free_notify = 985 si->bdev->bd_disk->fops->swap_slot_free_notify; 986 else 987 swap_slot_free_notify = NULL; 988 while (offset <= end) { 989 arch_swap_invalidate_page(si->type, offset); 990 if (swap_slot_free_notify) 991 swap_slot_free_notify(si->bdev, offset); 992 offset++; 993 } 994 clear_shadow_from_swap_cache(si->type, begin, end); 995 996 /* 997 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 998 * only after the above cleanups are done. 
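	 * (try_to_unuse() checks si->inuse_pages without holding si->lock,
	 * so it relies on this barrier rather than on lock ordering.)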
999 */ 1000 smp_wmb(); 1001 atomic_long_add(nr_entries, &nr_swap_pages); 1002 WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); 1003 } 1004 1005 static void set_cluster_next(struct swap_info_struct *si, unsigned long next) 1006 { 1007 unsigned long prev; 1008 1009 if (!(si->flags & SWP_SOLIDSTATE)) { 1010 si->cluster_next = next; 1011 return; 1012 } 1013 1014 prev = this_cpu_read(*si->cluster_next_cpu); 1015 /* 1016 * Cross the swap address space size aligned trunk, choose 1017 * another trunk randomly to avoid lock contention on swap 1018 * address space if possible. 1019 */ 1020 if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != 1021 (next >> SWAP_ADDRESS_SPACE_SHIFT)) { 1022 /* No free swap slots available */ 1023 if (si->highest_bit <= si->lowest_bit) 1024 return; 1025 next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); 1026 next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); 1027 next = max_t(unsigned int, next, si->lowest_bit); 1028 } 1029 this_cpu_write(*si->cluster_next_cpu, next); 1030 } 1031 1032 static bool swap_offset_available_and_locked(struct swap_info_struct *si, 1033 unsigned long offset) 1034 { 1035 if (data_race(!si->swap_map[offset])) { 1036 spin_lock(&si->lock); 1037 return true; 1038 } 1039 1040 if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { 1041 spin_lock(&si->lock); 1042 return true; 1043 } 1044 1045 return false; 1046 } 1047 1048 static int cluster_alloc_swap(struct swap_info_struct *si, 1049 unsigned char usage, int nr, 1050 swp_entry_t slots[], int order) 1051 { 1052 int n_ret = 0; 1053 1054 VM_BUG_ON(!si->cluster_info); 1055 1056 si->flags += SWP_SCANNING; 1057 1058 while (n_ret < nr) { 1059 unsigned long offset = cluster_alloc_swap_entry(si, order, usage); 1060 1061 if (!offset) 1062 break; 1063 slots[n_ret++] = swp_entry(si->type, offset); 1064 } 1065 1066 si->flags -= SWP_SCANNING; 1067 1068 return n_ret; 1069 } 1070 1071 static int scan_swap_map_slots(struct swap_info_struct *si, 1072 unsigned char usage, int nr, 1073 swp_entry_t slots[], int order) 1074 { 1075 unsigned long offset; 1076 unsigned long scan_base; 1077 unsigned long last_in_cluster = 0; 1078 int latency_ration = LATENCY_LIMIT; 1079 unsigned int nr_pages = 1 << order; 1080 int n_ret = 0; 1081 bool scanned_many = false; 1082 1083 /* 1084 * We try to cluster swap pages by allocating them sequentially 1085 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 1086 * way, however, we resort to first-free allocation, starting 1087 * a new cluster. This prevents us from scattering swap pages 1088 * all over the entire swap partition, so that we reduce 1089 * overall disk seek times between swap pages. -- sct 1090 * But we do now try to find an empty cluster. -Andrea 1091 * And we let swap pages go all over an SSD partition. Hugh 1092 */ 1093 1094 if (order > 0) { 1095 /* 1096 * Should not even be attempting large allocations when huge 1097 * page swap is disabled. Warn and fail the allocation. 1098 */ 1099 if (!IS_ENABLED(CONFIG_THP_SWAP) || 1100 nr_pages > SWAPFILE_CLUSTER) { 1101 VM_WARN_ON_ONCE(1); 1102 return 0; 1103 } 1104 1105 /* 1106 * Swapfile is not block device or not using clusters so unable 1107 * to allocate large entries. 1108 */ 1109 if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) 1110 return 0; 1111 } 1112 1113 if (si->cluster_info) 1114 return cluster_alloc_swap(si, usage, nr, slots, order); 1115 1116 si->flags += SWP_SCANNING; 1117 1118 /* For HDD, sequential access is more important. 
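	 * Keep allocations clustered around cluster_next instead of spreading
	 * them across the device as the SSD path does, to reduce seeking.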
*/ 1119 scan_base = si->cluster_next; 1120 offset = scan_base; 1121 1122 if (unlikely(!si->cluster_nr--)) { 1123 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 1124 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1125 goto checks; 1126 } 1127 1128 spin_unlock(&si->lock); 1129 1130 /* 1131 * If seek is expensive, start searching for new cluster from 1132 * start of partition, to minimize the span of allocated swap. 1133 */ 1134 scan_base = offset = si->lowest_bit; 1135 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 1136 1137 /* Locate the first empty (unaligned) cluster */ 1138 for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { 1139 if (si->swap_map[offset]) 1140 last_in_cluster = offset + SWAPFILE_CLUSTER; 1141 else if (offset == last_in_cluster) { 1142 spin_lock(&si->lock); 1143 offset -= SWAPFILE_CLUSTER - 1; 1144 si->cluster_next = offset; 1145 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1146 goto checks; 1147 } 1148 if (unlikely(--latency_ration < 0)) { 1149 cond_resched(); 1150 latency_ration = LATENCY_LIMIT; 1151 } 1152 } 1153 1154 offset = scan_base; 1155 spin_lock(&si->lock); 1156 si->cluster_nr = SWAPFILE_CLUSTER - 1; 1157 } 1158 1159 checks: 1160 if (!(si->flags & SWP_WRITEOK)) 1161 goto no_page; 1162 if (!si->highest_bit) 1163 goto no_page; 1164 if (offset > si->highest_bit) 1165 scan_base = offset = si->lowest_bit; 1166 1167 /* reuse swap entry of cache-only swap if not busy. */ 1168 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 1169 int swap_was_freed; 1170 spin_unlock(&si->lock); 1171 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); 1172 spin_lock(&si->lock); 1173 /* entry was freed successfully, try to use this again */ 1174 if (swap_was_freed > 0) 1175 goto checks; 1176 goto scan; /* check next one */ 1177 } 1178 1179 if (si->swap_map[offset]) { 1180 if (!n_ret) 1181 goto scan; 1182 else 1183 goto done; 1184 } 1185 memset(si->swap_map + offset, usage, nr_pages); 1186 1187 swap_range_alloc(si, offset, nr_pages); 1188 slots[n_ret++] = swp_entry(si->type, offset); 1189 1190 /* got enough slots or reach max slots? */ 1191 if ((n_ret == nr) || (offset >= si->highest_bit)) 1192 goto done; 1193 1194 /* search for next available slot */ 1195 1196 /* time to take a break? */ 1197 if (unlikely(--latency_ration < 0)) { 1198 if (n_ret) 1199 goto done; 1200 spin_unlock(&si->lock); 1201 cond_resched(); 1202 spin_lock(&si->lock); 1203 latency_ration = LATENCY_LIMIT; 1204 } 1205 1206 if (si->cluster_nr && !si->swap_map[++offset]) { 1207 /* non-ssd case, still more slots in cluster? */ 1208 --si->cluster_nr; 1209 goto checks; 1210 } 1211 1212 /* 1213 * Even if there's no free clusters available (fragmented), 1214 * try to scan a little more quickly with lock held unless we 1215 * have scanned too many slots already. 
1216 */ 1217 if (!scanned_many) { 1218 unsigned long scan_limit; 1219 1220 if (offset < scan_base) 1221 scan_limit = scan_base; 1222 else 1223 scan_limit = si->highest_bit; 1224 for (; offset <= scan_limit && --latency_ration > 0; 1225 offset++) { 1226 if (!si->swap_map[offset]) 1227 goto checks; 1228 } 1229 } 1230 1231 done: 1232 if (order == 0) 1233 set_cluster_next(si, offset + 1); 1234 si->flags -= SWP_SCANNING; 1235 return n_ret; 1236 1237 scan: 1238 VM_WARN_ON(order > 0); 1239 spin_unlock(&si->lock); 1240 while (++offset <= READ_ONCE(si->highest_bit)) { 1241 if (unlikely(--latency_ration < 0)) { 1242 cond_resched(); 1243 latency_ration = LATENCY_LIMIT; 1244 scanned_many = true; 1245 } 1246 if (swap_offset_available_and_locked(si, offset)) 1247 goto checks; 1248 } 1249 offset = si->lowest_bit; 1250 while (offset < scan_base) { 1251 if (unlikely(--latency_ration < 0)) { 1252 cond_resched(); 1253 latency_ration = LATENCY_LIMIT; 1254 scanned_many = true; 1255 } 1256 if (swap_offset_available_and_locked(si, offset)) 1257 goto checks; 1258 offset++; 1259 } 1260 spin_lock(&si->lock); 1261 1262 no_page: 1263 si->flags -= SWP_SCANNING; 1264 return n_ret; 1265 } 1266 1267 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) 1268 { 1269 int order = swap_entry_order(entry_order); 1270 unsigned long size = 1 << order; 1271 struct swap_info_struct *si, *next; 1272 long avail_pgs; 1273 int n_ret = 0; 1274 int node; 1275 1276 spin_lock(&swap_avail_lock); 1277 1278 avail_pgs = atomic_long_read(&nr_swap_pages) / size; 1279 if (avail_pgs <= 0) { 1280 spin_unlock(&swap_avail_lock); 1281 goto noswap; 1282 } 1283 1284 n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); 1285 1286 atomic_long_sub(n_goal * size, &nr_swap_pages); 1287 1288 start_over: 1289 node = numa_node_id(); 1290 plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { 1291 /* requeue si to after same-priority siblings */ 1292 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); 1293 spin_unlock(&swap_avail_lock); 1294 spin_lock(&si->lock); 1295 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { 1296 spin_lock(&swap_avail_lock); 1297 if (plist_node_empty(&si->avail_lists[node])) { 1298 spin_unlock(&si->lock); 1299 goto nextsi; 1300 } 1301 WARN(!si->highest_bit, 1302 "swap_info %d in list but !highest_bit\n", 1303 si->type); 1304 WARN(!(si->flags & SWP_WRITEOK), 1305 "swap_info %d in list but !SWP_WRITEOK\n", 1306 si->type); 1307 __del_from_avail_list(si); 1308 spin_unlock(&si->lock); 1309 goto nextsi; 1310 } 1311 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 1312 n_goal, swp_entries, order); 1313 spin_unlock(&si->lock); 1314 if (n_ret || size > 1) 1315 goto check_out; 1316 cond_resched(); 1317 1318 spin_lock(&swap_avail_lock); 1319 nextsi: 1320 /* 1321 * if we got here, it's likely that si was almost full before, 1322 * and since scan_swap_map_slots() can drop the si->lock, 1323 * multiple callers probably all tried to get a page from the 1324 * same si and it filled up before we could get one; or, the si 1325 * filled up between us dropping swap_avail_lock and taking 1326 * si->lock. Since we dropped the swap_avail_lock, the 1327 * swap_avail_head list may have been modified; so if next is 1328 * still in the swap_avail_head list then try it, otherwise 1329 * start over if we have not gotten any slots. 
1330 */ 1331 if (plist_node_empty(&next->avail_lists[node])) 1332 goto start_over; 1333 } 1334 1335 spin_unlock(&swap_avail_lock); 1336 1337 check_out: 1338 if (n_ret < n_goal) 1339 atomic_long_add((long)(n_goal - n_ret) * size, 1340 &nr_swap_pages); 1341 noswap: 1342 return n_ret; 1343 } 1344 1345 static struct swap_info_struct *_swap_info_get(swp_entry_t entry) 1346 { 1347 struct swap_info_struct *si; 1348 unsigned long offset; 1349 1350 if (!entry.val) 1351 goto out; 1352 si = swp_swap_info(entry); 1353 if (!si) 1354 goto bad_nofile; 1355 if (data_race(!(si->flags & SWP_USED))) 1356 goto bad_device; 1357 offset = swp_offset(entry); 1358 if (offset >= si->max) 1359 goto bad_offset; 1360 if (data_race(!si->swap_map[swp_offset(entry)])) 1361 goto bad_free; 1362 return si; 1363 1364 bad_free: 1365 pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); 1366 goto out; 1367 bad_offset: 1368 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); 1369 goto out; 1370 bad_device: 1371 pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); 1372 goto out; 1373 bad_nofile: 1374 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1375 out: 1376 return NULL; 1377 } 1378 1379 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, 1380 struct swap_info_struct *q) 1381 { 1382 struct swap_info_struct *p; 1383 1384 p = _swap_info_get(entry); 1385 1386 if (p != q) { 1387 if (q != NULL) 1388 spin_unlock(&q->lock); 1389 if (p != NULL) 1390 spin_lock(&p->lock); 1391 } 1392 return p; 1393 } 1394 1395 static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, 1396 unsigned long offset, 1397 unsigned char usage) 1398 { 1399 unsigned char count; 1400 unsigned char has_cache; 1401 1402 count = si->swap_map[offset]; 1403 1404 has_cache = count & SWAP_HAS_CACHE; 1405 count &= ~SWAP_HAS_CACHE; 1406 1407 if (usage == SWAP_HAS_CACHE) { 1408 VM_BUG_ON(!has_cache); 1409 has_cache = 0; 1410 } else if (count == SWAP_MAP_SHMEM) { 1411 /* 1412 * Or we could insist on shmem.c using a special 1413 * swap_shmem_free() and free_shmem_swap_and_cache()... 1414 */ 1415 count = 0; 1416 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 1417 if (count == COUNT_CONTINUED) { 1418 if (swap_count_continued(si, offset, count)) 1419 count = SWAP_MAP_MAX | COUNT_CONTINUED; 1420 else 1421 count = SWAP_MAP_MAX; 1422 } else 1423 count--; 1424 } 1425 1426 usage = count | has_cache; 1427 if (usage) 1428 WRITE_ONCE(si->swap_map[offset], usage); 1429 else 1430 WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); 1431 1432 return usage; 1433 } 1434 1435 /* 1436 * When we get a swap entry, if there aren't some other ways to 1437 * prevent swapoff, such as the folio in swap cache is locked, RCU 1438 * reader side is locked, etc., the swap entry may become invalid 1439 * because of swapoff. Then, we need to enclose all swap related 1440 * functions with get_swap_device() and put_swap_device(), unless the 1441 * swap functions call get/put_swap_device() by themselves. 1442 * 1443 * RCU reader side lock (including any spinlock) is sufficient to 1444 * prevent swapoff, because synchronize_rcu() is called in swapoff() 1445 * before freeing data structures. 1446 * 1447 * Check whether swap entry is valid in the swap device. If so, 1448 * return pointer to swap_info_struct, and keep the swap entry valid 1449 * via preventing the swap device from being swapoff, until 1450 * put_swap_device() is called. Otherwise return NULL. 
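 *
 * A minimal usage sketch (names as used in this file):
 *
 *	si = get_swap_device(entry);
 *	if (!si)
 *		return;		(entry is stale or the device is gone)
 *	... access si->swap_map, the swap cache, etc. ...
 *	put_swap_device(si);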
1451 * 1452 * Notice that swapoff or swapoff+swapon can still happen before the 1453 * percpu_ref_tryget_live() in get_swap_device() or after the 1454 * percpu_ref_put() in put_swap_device() if there isn't any other way 1455 * to prevent swapoff. The caller must be prepared for that. For 1456 * example, the following situation is possible. 1457 * 1458 * CPU1 CPU2 1459 * do_swap_page() 1460 * ... swapoff+swapon 1461 * __read_swap_cache_async() 1462 * swapcache_prepare() 1463 * __swap_duplicate() 1464 * // check swap_map 1465 * // verify PTE not changed 1466 * 1467 * In __swap_duplicate(), the swap_map need to be checked before 1468 * changing partly because the specified swap entry may be for another 1469 * swap device which has been swapoff. And in do_swap_page(), after 1470 * the page is read from the swap device, the PTE is verified not 1471 * changed with the page table locked to check whether the swap device 1472 * has been swapoff or swapoff+swapon. 1473 */ 1474 struct swap_info_struct *get_swap_device(swp_entry_t entry) 1475 { 1476 struct swap_info_struct *si; 1477 unsigned long offset; 1478 1479 if (!entry.val) 1480 goto out; 1481 si = swp_swap_info(entry); 1482 if (!si) 1483 goto bad_nofile; 1484 if (!percpu_ref_tryget_live(&si->users)) 1485 goto out; 1486 /* 1487 * Guarantee the si->users are checked before accessing other 1488 * fields of swap_info_struct. 1489 * 1490 * Paired with the spin_unlock() after setup_swap_info() in 1491 * enable_swap_info(). 1492 */ 1493 smp_rmb(); 1494 offset = swp_offset(entry); 1495 if (offset >= si->max) 1496 goto put_out; 1497 1498 return si; 1499 bad_nofile: 1500 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); 1501 out: 1502 return NULL; 1503 put_out: 1504 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); 1505 percpu_ref_put(&si->users); 1506 return NULL; 1507 } 1508 1509 static unsigned char __swap_entry_free(struct swap_info_struct *si, 1510 swp_entry_t entry) 1511 { 1512 struct swap_cluster_info *ci; 1513 unsigned long offset = swp_offset(entry); 1514 unsigned char usage; 1515 1516 ci = lock_cluster_or_swap_info(si, offset); 1517 usage = __swap_entry_free_locked(si, offset, 1); 1518 unlock_cluster_or_swap_info(si, ci); 1519 if (!usage) 1520 free_swap_slot(entry); 1521 1522 return usage; 1523 } 1524 1525 static bool __swap_entries_free(struct swap_info_struct *si, 1526 swp_entry_t entry, int nr) 1527 { 1528 unsigned long offset = swp_offset(entry); 1529 unsigned int type = swp_type(entry); 1530 struct swap_cluster_info *ci; 1531 bool has_cache = false; 1532 unsigned char count; 1533 int i; 1534 1535 if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) 1536 goto fallback; 1537 /* cross into another cluster */ 1538 if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) 1539 goto fallback; 1540 1541 ci = lock_cluster_or_swap_info(si, offset); 1542 if (!swap_is_last_map(si, offset, nr, &has_cache)) { 1543 unlock_cluster_or_swap_info(si, ci); 1544 goto fallback; 1545 } 1546 for (i = 0; i < nr; i++) 1547 WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); 1548 unlock_cluster_or_swap_info(si, ci); 1549 1550 if (!has_cache) { 1551 for (i = 0; i < nr; i++) 1552 zswap_invalidate(swp_entry(si->type, offset + i)); 1553 spin_lock(&si->lock); 1554 swap_entry_range_free(si, entry, nr); 1555 spin_unlock(&si->lock); 1556 } 1557 return has_cache; 1558 1559 fallback: 1560 for (i = 0; i < nr; i++) { 1561 if (data_race(si->swap_map[offset + i])) { 1562 count = __swap_entry_free(si, swp_entry(type, offset + i)); 1563 if (count == 
SWAP_HAS_CACHE)
				has_cache = true;
		} else {
			WARN_ON_ONCE(1);
		}
	}
	return has_cache;
}

/*
 * Drop the last HAS_CACHE flag of swap entries; the caller has to
 * ensure all entries belong to the same cgroup.
 */
static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
				  unsigned int nr_pages)
{
	unsigned long offset = swp_offset(entry);
	unsigned char *map = si->swap_map + offset;
	unsigned char *map_end = map + nr_pages;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	do {
		VM_BUG_ON(*map != SWAP_HAS_CACHE);
		*map = 0;
	} while (++map < map_end);
	dec_cluster_info_page(si, ci, nr_pages);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, nr_pages);
	swap_range_free(si, offset, nr_pages);
}

static void cluster_swap_free_nr(struct swap_info_struct *si,
		unsigned long offset, int nr_pages,
		unsigned char usage)
{
	struct swap_cluster_info *ci;
	DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
	int i, nr;

	ci = lock_cluster_or_swap_info(si, offset);
	while (nr_pages) {
		nr = min(BITS_PER_LONG, nr_pages);
		for (i = 0; i < nr; i++) {
			if (!__swap_entry_free_locked(si, offset + i, usage))
				bitmap_set(to_free, i, 1);
		}
		if (!bitmap_empty(to_free, BITS_PER_LONG)) {
			unlock_cluster_or_swap_info(si, ci);
			for_each_set_bit(i, to_free, BITS_PER_LONG)
				free_swap_slot(swp_entry(si->type, offset + i));
			if (nr == nr_pages)
				return;
			bitmap_clear(to_free, 0, BITS_PER_LONG);
			ci = lock_cluster_or_swap_info(si, offset);
		}
		offset += nr;
		nr_pages -= nr;
	}
	unlock_cluster_or_swap_info(si, ci);
}

/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free_nr(swp_entry_t entry, int nr_pages)
{
	int nr;
	struct swap_info_struct *sis;
	unsigned long offset = swp_offset(entry);

	sis = _swap_info_get(entry);
	if (!sis)
		return;

	while (nr_pages) {
		nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
		cluster_swap_free_nr(sis, offset, nr, 1);
		offset += nr;
		nr_pages -= nr;
	}
}

/*
 * Called after dropping the swap cache to decrease the refcount on the
 * swap entries.
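 * If all slots of a (possibly large) folio are cache-only, the whole range
 * is freed in one go; otherwise each slot is released individually.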
1650 */ 1651 void put_swap_folio(struct folio *folio, swp_entry_t entry) 1652 { 1653 unsigned long offset = swp_offset(entry); 1654 struct swap_cluster_info *ci; 1655 struct swap_info_struct *si; 1656 int size = 1 << swap_entry_order(folio_order(folio)); 1657 1658 si = _swap_info_get(entry); 1659 if (!si) 1660 return; 1661 1662 ci = lock_cluster_or_swap_info(si, offset); 1663 if (size > 1 && swap_is_has_cache(si, offset, size)) { 1664 unlock_cluster_or_swap_info(si, ci); 1665 spin_lock(&si->lock); 1666 swap_entry_range_free(si, entry, size); 1667 spin_unlock(&si->lock); 1668 return; 1669 } 1670 for (int i = 0; i < size; i++, entry.val++) { 1671 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 1672 unlock_cluster_or_swap_info(si, ci); 1673 free_swap_slot(entry); 1674 if (i == size - 1) 1675 return; 1676 lock_cluster_or_swap_info(si, offset); 1677 } 1678 } 1679 unlock_cluster_or_swap_info(si, ci); 1680 } 1681 1682 static int swp_entry_cmp(const void *ent1, const void *ent2) 1683 { 1684 const swp_entry_t *e1 = ent1, *e2 = ent2; 1685 1686 return (int)swp_type(*e1) - (int)swp_type(*e2); 1687 } 1688 1689 void swapcache_free_entries(swp_entry_t *entries, int n) 1690 { 1691 struct swap_info_struct *p, *prev; 1692 int i; 1693 1694 if (n <= 0) 1695 return; 1696 1697 prev = NULL; 1698 p = NULL; 1699 1700 /* 1701 * Sort swap entries by swap device, so each lock is only taken once. 1702 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is 1703 * so low that it isn't necessary to optimize further. 1704 */ 1705 if (nr_swapfiles > 1) 1706 sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); 1707 for (i = 0; i < n; ++i) { 1708 p = swap_info_get_cont(entries[i], prev); 1709 if (p) 1710 swap_entry_range_free(p, entries[i], 1); 1711 prev = p; 1712 } 1713 if (p) 1714 spin_unlock(&p->lock); 1715 } 1716 1717 int __swap_count(swp_entry_t entry) 1718 { 1719 struct swap_info_struct *si = swp_swap_info(entry); 1720 pgoff_t offset = swp_offset(entry); 1721 1722 return swap_count(si->swap_map[offset]); 1723 } 1724 1725 /* 1726 * How many references to @entry are currently swapped out? 1727 * This does not give an exact answer when swap count is continued, 1728 * but does include the high COUNT_CONTINUED flag to allow for that. 1729 */ 1730 int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) 1731 { 1732 pgoff_t offset = swp_offset(entry); 1733 struct swap_cluster_info *ci; 1734 int count; 1735 1736 ci = lock_cluster_or_swap_info(si, offset); 1737 count = swap_count(si->swap_map[offset]); 1738 unlock_cluster_or_swap_info(si, ci); 1739 return count; 1740 } 1741 1742 /* 1743 * How many references to @entry are currently swapped out? 1744 * This considers COUNT_CONTINUED so it returns exact answer. 
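 * Walking the continuation pages makes this slower than swap_swapcount(),
 * so prefer the latter unless the exact count matters.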
1745 */ 1746 int swp_swapcount(swp_entry_t entry) 1747 { 1748 int count, tmp_count, n; 1749 struct swap_info_struct *si; 1750 struct swap_cluster_info *ci; 1751 struct page *page; 1752 pgoff_t offset; 1753 unsigned char *map; 1754 1755 si = _swap_info_get(entry); 1756 if (!si) 1757 return 0; 1758 1759 offset = swp_offset(entry); 1760 1761 ci = lock_cluster_or_swap_info(si, offset); 1762 1763 count = swap_count(si->swap_map[offset]); 1764 if (!(count & COUNT_CONTINUED)) 1765 goto out; 1766 1767 count &= ~COUNT_CONTINUED; 1768 n = SWAP_MAP_MAX + 1; 1769 1770 page = vmalloc_to_page(si->swap_map + offset); 1771 offset &= ~PAGE_MASK; 1772 VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1773 1774 do { 1775 page = list_next_entry(page, lru); 1776 map = kmap_local_page(page); 1777 tmp_count = map[offset]; 1778 kunmap_local(map); 1779 1780 count += (tmp_count & ~COUNT_CONTINUED) * n; 1781 n *= (SWAP_CONT_MAX + 1); 1782 } while (tmp_count & COUNT_CONTINUED); 1783 out: 1784 unlock_cluster_or_swap_info(si, ci); 1785 return count; 1786 } 1787 1788 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, 1789 swp_entry_t entry, int order) 1790 { 1791 struct swap_cluster_info *ci; 1792 unsigned char *map = si->swap_map; 1793 unsigned int nr_pages = 1 << order; 1794 unsigned long roffset = swp_offset(entry); 1795 unsigned long offset = round_down(roffset, nr_pages); 1796 int i; 1797 bool ret = false; 1798 1799 ci = lock_cluster_or_swap_info(si, offset); 1800 if (!ci || nr_pages == 1) { 1801 if (swap_count(map[roffset])) 1802 ret = true; 1803 goto unlock_out; 1804 } 1805 for (i = 0; i < nr_pages; i++) { 1806 if (swap_count(map[offset + i])) { 1807 ret = true; 1808 break; 1809 } 1810 } 1811 unlock_out: 1812 unlock_cluster_or_swap_info(si, ci); 1813 return ret; 1814 } 1815 1816 static bool folio_swapped(struct folio *folio) 1817 { 1818 swp_entry_t entry = folio->swap; 1819 struct swap_info_struct *si = _swap_info_get(entry); 1820 1821 if (!si) 1822 return false; 1823 1824 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) 1825 return swap_swapcount(si, entry) != 0; 1826 1827 return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); 1828 } 1829 1830 static bool folio_swapcache_freeable(struct folio *folio) 1831 { 1832 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 1833 1834 if (!folio_test_swapcache(folio)) 1835 return false; 1836 if (folio_test_writeback(folio)) 1837 return false; 1838 1839 /* 1840 * Once hibernation has begun to create its image of memory, 1841 * there's a danger that one of the calls to folio_free_swap() 1842 * - most probably a call from __try_to_reclaim_swap() while 1843 * hibernation is allocating its own swap pages for the image, 1844 * but conceivably even a call from memory reclaim - will free 1845 * the swap from a folio which has already been recorded in the 1846 * image as a clean swapcache folio, and then reuse its swap for 1847 * another page of the image. On waking from hibernation, the 1848 * original folio might be freed under memory pressure, then 1849 * later read back in from swap, now with the wrong data. 1850 * 1851 * Hibernation suspends storage while it is writing the image 1852 * to disk so check that here. 1853 */ 1854 if (pm_suspended_storage()) 1855 return false; 1856 1857 return true; 1858 } 1859 1860 /** 1861 * folio_free_swap() - Free the swap space used for this folio. 1862 * @folio: The folio to remove. 
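 * Context: The caller must hold the folio lock.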
1863 * 1864 * If swap is getting full, or if there are no more mappings of this folio, 1865 * then call folio_free_swap to free its swap space. 1866 * 1867 * Return: true if we were able to release the swap space. 1868 */ 1869 bool folio_free_swap(struct folio *folio) 1870 { 1871 if (!folio_swapcache_freeable(folio)) 1872 return false; 1873 if (folio_swapped(folio)) 1874 return false; 1875 1876 delete_from_swap_cache(folio); 1877 folio_set_dirty(folio); 1878 return true; 1879 } 1880 1881 /** 1882 * free_swap_and_cache_nr() - Release reference on range of swap entries and 1883 * reclaim their cache if no more references remain. 1884 * @entry: First entry of range. 1885 * @nr: Number of entries in range. 1886 * 1887 * For each swap entry in the contiguous range, release a reference. If any swap 1888 * entries become free, try to reclaim their underlying folios, if present. The 1889 * offset range is defined by [entry.offset, entry.offset + nr). 1890 */ 1891 void free_swap_and_cache_nr(swp_entry_t entry, int nr) 1892 { 1893 const unsigned long start_offset = swp_offset(entry); 1894 const unsigned long end_offset = start_offset + nr; 1895 struct swap_info_struct *si; 1896 bool any_only_cache = false; 1897 unsigned long offset; 1898 1899 if (non_swap_entry(entry)) 1900 return; 1901 1902 si = get_swap_device(entry); 1903 if (!si) 1904 return; 1905 1906 if (WARN_ON(end_offset > si->max)) 1907 goto out; 1908 1909 /* 1910 * First free all entries in the range. 1911 */ 1912 any_only_cache = __swap_entries_free(si, entry, nr); 1913 1914 /* 1915 * Short-circuit the below loop if none of the entries had their 1916 * reference drop to zero. 1917 */ 1918 if (!any_only_cache) 1919 goto out; 1920 1921 /* 1922 * Now go back over the range trying to reclaim the swap cache. This is 1923 * more efficient for large folios because we will only try to reclaim 1924 * the swap once per folio in the common case. If we do 1925 * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the 1926 * latter will get a reference and lock the folio for every individual 1927 * page but will only succeed once the swap slot for every subpage is 1928 * zero. 1929 */ 1930 for (offset = start_offset; offset < end_offset; offset += nr) { 1931 nr = 1; 1932 if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { 1933 /* 1934 * Folios are always naturally aligned in swap so 1935 * advance forward to the next boundary. Zero means no 1936 * folio was found for the swap entry, so advance by 1 1937 * in this case. Negative value means folio was found 1938 * but could not be reclaimed. Here we can still advance 1939 * to the next boundary. 1940 */ 1941 nr = __try_to_reclaim_swap(si, offset, 1942 TTRS_UNMAPPED | TTRS_FULL); 1943 if (nr == 0) 1944 nr = 1; 1945 else if (nr < 0) 1946 nr = -nr; 1947 nr = ALIGN(offset + 1, nr) - offset; 1948 } 1949 } 1950 1951 out: 1952 put_swap_device(si); 1953 } 1954 1955 #ifdef CONFIG_HIBERNATION 1956 1957 swp_entry_t get_swap_page_of_type(int type) 1958 { 1959 struct swap_info_struct *si = swap_type_to_swap_info(type); 1960 swp_entry_t entry = {0}; 1961 1962 if (!si) 1963 goto fail; 1964 1965 /* This is called for allocating swap entry, not cache */ 1966 spin_lock(&si->lock); 1967 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) 1968 atomic_long_dec(&nr_swap_pages); 1969 spin_unlock(&si->lock); 1970 fail: 1971 return entry; 1972 } 1973 1974 /* 1975 * Find the swap type that corresponds to given device (if any). 
1976 * 1977 * @offset - number of the PAGE_SIZE-sized block of the device, starting 1978 * from 0, in which the swap header is expected to be located. 1979 * 1980 * This is needed for the suspend to disk (aka swsusp). 1981 */ 1982 int swap_type_of(dev_t device, sector_t offset) 1983 { 1984 int type; 1985 1986 if (!device) 1987 return -1; 1988 1989 spin_lock(&swap_lock); 1990 for (type = 0; type < nr_swapfiles; type++) { 1991 struct swap_info_struct *sis = swap_info[type]; 1992 1993 if (!(sis->flags & SWP_WRITEOK)) 1994 continue; 1995 1996 if (device == sis->bdev->bd_dev) { 1997 struct swap_extent *se = first_se(sis); 1998 1999 if (se->start_block == offset) { 2000 spin_unlock(&swap_lock); 2001 return type; 2002 } 2003 } 2004 } 2005 spin_unlock(&swap_lock); 2006 return -ENODEV; 2007 } 2008 2009 int find_first_swap(dev_t *device) 2010 { 2011 int type; 2012 2013 spin_lock(&swap_lock); 2014 for (type = 0; type < nr_swapfiles; type++) { 2015 struct swap_info_struct *sis = swap_info[type]; 2016 2017 if (!(sis->flags & SWP_WRITEOK)) 2018 continue; 2019 *device = sis->bdev->bd_dev; 2020 spin_unlock(&swap_lock); 2021 return type; 2022 } 2023 spin_unlock(&swap_lock); 2024 return -ENODEV; 2025 } 2026 2027 /* 2028 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 2029 * corresponding to given index in swap_info (swap type). 2030 */ 2031 sector_t swapdev_block(int type, pgoff_t offset) 2032 { 2033 struct swap_info_struct *si = swap_type_to_swap_info(type); 2034 struct swap_extent *se; 2035 2036 if (!si || !(si->flags & SWP_WRITEOK)) 2037 return 0; 2038 se = offset_to_swap_extent(si, offset); 2039 return se->start_block + (offset - se->start_page); 2040 } 2041 2042 /* 2043 * Return either the total number of swap pages of given type, or the number 2044 * of free pages of that type (depending on @free) 2045 * 2046 * This is needed for software suspend 2047 */ 2048 unsigned int count_swap_pages(int type, int free) 2049 { 2050 unsigned int n = 0; 2051 2052 spin_lock(&swap_lock); 2053 if ((unsigned int)type < nr_swapfiles) { 2054 struct swap_info_struct *sis = swap_info[type]; 2055 2056 spin_lock(&sis->lock); 2057 if (sis->flags & SWP_WRITEOK) { 2058 n = sis->pages; 2059 if (free) 2060 n -= sis->inuse_pages; 2061 } 2062 spin_unlock(&sis->lock); 2063 } 2064 spin_unlock(&swap_lock); 2065 return n; 2066 } 2067 #endif /* CONFIG_HIBERNATION */ 2068 2069 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) 2070 { 2071 return pte_same(pte_swp_clear_flags(pte), swp_pte); 2072 } 2073 2074 /* 2075 * No need to decide whether this PTE shares the swap entry with others, 2076 * just let do_wp_page work it out if a write is requested later - to 2077 * force COW, vm_page_prot omits write permission from any private vma. 
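* (unuse_pte() below installs an old pte built from vma->vm_page_prot, so a * private mapping gets a write-protected pte and any later write is resolved * by do_wp_page().)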
2078 */ 2079 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 2080 unsigned long addr, swp_entry_t entry, struct folio *folio) 2081 { 2082 struct page *page; 2083 struct folio *swapcache; 2084 spinlock_t *ptl; 2085 pte_t *pte, new_pte, old_pte; 2086 bool hwpoisoned = false; 2087 int ret = 1; 2088 2089 swapcache = folio; 2090 folio = ksm_might_need_to_copy(folio, vma, addr); 2091 if (unlikely(!folio)) 2092 return -ENOMEM; 2093 else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { 2094 hwpoisoned = true; 2095 folio = swapcache; 2096 } 2097 2098 page = folio_file_page(folio, swp_offset(entry)); 2099 if (PageHWPoison(page)) 2100 hwpoisoned = true; 2101 2102 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 2103 if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), 2104 swp_entry_to_pte(entry)))) { 2105 ret = 0; 2106 goto out; 2107 } 2108 2109 old_pte = ptep_get(pte); 2110 2111 if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { 2112 swp_entry_t swp_entry; 2113 2114 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2115 if (hwpoisoned) { 2116 swp_entry = make_hwpoison_entry(page); 2117 } else { 2118 swp_entry = make_poisoned_swp_entry(); 2119 } 2120 new_pte = swp_entry_to_pte(swp_entry); 2121 ret = 0; 2122 goto setpte; 2123 } 2124 2125 /* 2126 * Some architectures may have to restore extra metadata to the page 2127 * when reading from swap. This metadata may be indexed by swap entry 2128 * so this must be called before swap_free(). 2129 */ 2130 arch_swap_restore(folio_swap(entry, folio), folio); 2131 2132 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 2133 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 2134 folio_get(folio); 2135 if (folio == swapcache) { 2136 rmap_t rmap_flags = RMAP_NONE; 2137 2138 /* 2139 * See do_swap_page(): writeback would be problematic. 2140 * However, we do a folio_wait_writeback() just before this 2141 * call and have the folio locked. 2142 */ 2143 VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); 2144 if (pte_swp_exclusive(old_pte)) 2145 rmap_flags |= RMAP_EXCLUSIVE; 2146 /* 2147 * We currently only expect small !anon folios, which are either 2148 * fully exclusive or fully shared. If we ever get large folios 2149 * here, we have to be careful. 
2150 */ 2151 if (!folio_test_anon(folio)) { 2152 VM_WARN_ON_ONCE(folio_test_large(folio)); 2153 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); 2154 folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); 2155 } else { 2156 folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); 2157 } 2158 } else { /* ksm created a completely new copy */ 2159 folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); 2160 folio_add_lru_vma(folio, vma); 2161 } 2162 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); 2163 if (pte_swp_soft_dirty(old_pte)) 2164 new_pte = pte_mksoft_dirty(new_pte); 2165 if (pte_swp_uffd_wp(old_pte)) 2166 new_pte = pte_mkuffd_wp(new_pte); 2167 setpte: 2168 set_pte_at(vma->vm_mm, addr, pte, new_pte); 2169 swap_free(entry); 2170 out: 2171 if (pte) 2172 pte_unmap_unlock(pte, ptl); 2173 if (folio != swapcache) { 2174 folio_unlock(folio); 2175 folio_put(folio); 2176 } 2177 return ret; 2178 } 2179 2180 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 2181 unsigned long addr, unsigned long end, 2182 unsigned int type) 2183 { 2184 pte_t *pte = NULL; 2185 struct swap_info_struct *si; 2186 2187 si = swap_info[type]; 2188 do { 2189 struct folio *folio; 2190 unsigned long offset; 2191 unsigned char swp_count; 2192 swp_entry_t entry; 2193 int ret; 2194 pte_t ptent; 2195 2196 if (!pte++) { 2197 pte = pte_offset_map(pmd, addr); 2198 if (!pte) 2199 break; 2200 } 2201 2202 ptent = ptep_get_lockless(pte); 2203 2204 if (!is_swap_pte(ptent)) 2205 continue; 2206 2207 entry = pte_to_swp_entry(ptent); 2208 if (swp_type(entry) != type) 2209 continue; 2210 2211 offset = swp_offset(entry); 2212 pte_unmap(pte); 2213 pte = NULL; 2214 2215 folio = swap_cache_get_folio(entry, vma, addr); 2216 if (!folio) { 2217 struct vm_fault vmf = { 2218 .vma = vma, 2219 .address = addr, 2220 .real_address = addr, 2221 .pmd = pmd, 2222 }; 2223 2224 folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 2225 &vmf); 2226 } 2227 if (!folio) { 2228 swp_count = READ_ONCE(si->swap_map[offset]); 2229 if (swp_count == 0 || swp_count == SWAP_MAP_BAD) 2230 continue; 2231 return -ENOMEM; 2232 } 2233 2234 folio_lock(folio); 2235 folio_wait_writeback(folio); 2236 ret = unuse_pte(vma, pmd, addr, entry, folio); 2237 if (ret < 0) { 2238 folio_unlock(folio); 2239 folio_put(folio); 2240 return ret; 2241 } 2242 2243 folio_free_swap(folio); 2244 folio_unlock(folio); 2245 folio_put(folio); 2246 } while (addr += PAGE_SIZE, addr != end); 2247 2248 if (pte) 2249 pte_unmap(pte); 2250 return 0; 2251 } 2252 2253 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 2254 unsigned long addr, unsigned long end, 2255 unsigned int type) 2256 { 2257 pmd_t *pmd; 2258 unsigned long next; 2259 int ret; 2260 2261 pmd = pmd_offset(pud, addr); 2262 do { 2263 cond_resched(); 2264 next = pmd_addr_end(addr, end); 2265 ret = unuse_pte_range(vma, pmd, addr, next, type); 2266 if (ret) 2267 return ret; 2268 } while (pmd++, addr = next, addr != end); 2269 return 0; 2270 } 2271 2272 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 2273 unsigned long addr, unsigned long end, 2274 unsigned int type) 2275 { 2276 pud_t *pud; 2277 unsigned long next; 2278 int ret; 2279 2280 pud = pud_offset(p4d, addr); 2281 do { 2282 next = pud_addr_end(addr, end); 2283 if (pud_none_or_clear_bad(pud)) 2284 continue; 2285 ret = unuse_pmd_range(vma, pud, addr, next, type); 2286 if (ret) 2287 return ret; 2288 } while (pud++, addr = next, addr != end); 2289 return 0; 2290 } 2291 2292 static inline int unuse_p4d_range(struct 
vm_area_struct *vma, pgd_t *pgd, 2293 unsigned long addr, unsigned long end, 2294 unsigned int type) 2295 { 2296 p4d_t *p4d; 2297 unsigned long next; 2298 int ret; 2299 2300 p4d = p4d_offset(pgd, addr); 2301 do { 2302 next = p4d_addr_end(addr, end); 2303 if (p4d_none_or_clear_bad(p4d)) 2304 continue; 2305 ret = unuse_pud_range(vma, p4d, addr, next, type); 2306 if (ret) 2307 return ret; 2308 } while (p4d++, addr = next, addr != end); 2309 return 0; 2310 } 2311 2312 static int unuse_vma(struct vm_area_struct *vma, unsigned int type) 2313 { 2314 pgd_t *pgd; 2315 unsigned long addr, end, next; 2316 int ret; 2317 2318 addr = vma->vm_start; 2319 end = vma->vm_end; 2320 2321 pgd = pgd_offset(vma->vm_mm, addr); 2322 do { 2323 next = pgd_addr_end(addr, end); 2324 if (pgd_none_or_clear_bad(pgd)) 2325 continue; 2326 ret = unuse_p4d_range(vma, pgd, addr, next, type); 2327 if (ret) 2328 return ret; 2329 } while (pgd++, addr = next, addr != end); 2330 return 0; 2331 } 2332 2333 static int unuse_mm(struct mm_struct *mm, unsigned int type) 2334 { 2335 struct vm_area_struct *vma; 2336 int ret = 0; 2337 VMA_ITERATOR(vmi, mm, 0); 2338 2339 mmap_read_lock(mm); 2340 for_each_vma(vmi, vma) { 2341 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { 2342 ret = unuse_vma(vma, type); 2343 if (ret) 2344 break; 2345 } 2346 2347 cond_resched(); 2348 } 2349 mmap_read_unlock(mm); 2350 return ret; 2351 } 2352 2353 /* 2354 * Scan swap_map from current position to next entry still in use. 2355 * Return 0 if there are no inuse entries after prev till end of 2356 * the map. 2357 */ 2358 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2359 unsigned int prev) 2360 { 2361 unsigned int i; 2362 unsigned char count; 2363 2364 /* 2365 * No need for swap_lock here: we're just looking 2366 * for whether an entry is in use, not modifying it; false 2367 * hits are okay, and sys_swapoff() has already prevented new 2368 * allocations from this area (while holding swap_lock). 2369 */ 2370 for (i = prev + 1; i < si->max; i++) { 2371 count = READ_ONCE(si->swap_map[i]); 2372 if (count && swap_count(count) != SWAP_MAP_BAD) 2373 break; 2374 if ((i % LATENCY_LIMIT) == 0) 2375 cond_resched(); 2376 } 2377 2378 if (i == si->max) 2379 i = 0; 2380 2381 return i; 2382 } 2383 2384 static int try_to_unuse(unsigned int type) 2385 { 2386 struct mm_struct *prev_mm; 2387 struct mm_struct *mm; 2388 struct list_head *p; 2389 int retval = 0; 2390 struct swap_info_struct *si = swap_info[type]; 2391 struct folio *folio; 2392 swp_entry_t entry; 2393 unsigned int i; 2394 2395 if (!READ_ONCE(si->inuse_pages)) 2396 goto success; 2397 2398 retry: 2399 retval = shmem_unuse(type); 2400 if (retval) 2401 return retval; 2402 2403 prev_mm = &init_mm; 2404 mmget(prev_mm); 2405 2406 spin_lock(&mmlist_lock); 2407 p = &init_mm.mmlist; 2408 while (READ_ONCE(si->inuse_pages) && 2409 !signal_pending(current) && 2410 (p = p->next) != &init_mm.mmlist) { 2411 2412 mm = list_entry(p, struct mm_struct, mmlist); 2413 if (!mmget_not_zero(mm)) 2414 continue; 2415 spin_unlock(&mmlist_lock); 2416 mmput(prev_mm); 2417 prev_mm = mm; 2418 retval = unuse_mm(mm, type); 2419 if (retval) { 2420 mmput(prev_mm); 2421 return retval; 2422 } 2423 2424 /* 2425 * Make sure that we aren't completely killing 2426 * interactive performance. 
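* (The cond_resched() below yields the CPU between mm traversals, since * unuse_mm() may have scanned a large amount of address space.)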
2427 */ 2428 cond_resched(); 2429 spin_lock(&mmlist_lock); 2430 } 2431 spin_unlock(&mmlist_lock); 2432 2433 mmput(prev_mm); 2434 2435 i = 0; 2436 while (READ_ONCE(si->inuse_pages) && 2437 !signal_pending(current) && 2438 (i = find_next_to_unuse(si, i)) != 0) { 2439 2440 entry = swp_entry(type, i); 2441 folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); 2442 if (IS_ERR(folio)) 2443 continue; 2444 2445 /* 2446 * It is conceivable that a racing task removed this folio from 2447 * swap cache just before we acquired the page lock. The folio 2448 * might even be back in swap cache on another swap area. But 2449 * that is okay, folio_free_swap() only removes stale folios. 2450 */ 2451 folio_lock(folio); 2452 folio_wait_writeback(folio); 2453 folio_free_swap(folio); 2454 folio_unlock(folio); 2455 folio_put(folio); 2456 } 2457 2458 /* 2459 * Lets check again to see if there are still swap entries in the map. 2460 * If yes, we would need to do retry the unuse logic again. 2461 * Under global memory pressure, swap entries can be reinserted back 2462 * into process space after the mmlist loop above passes over them. 2463 * 2464 * Limit the number of retries? No: when mmget_not_zero() 2465 * above fails, that mm is likely to be freeing swap from 2466 * exit_mmap(), which proceeds at its own independent pace; 2467 * and even shmem_writepage() could have been preempted after 2468 * folio_alloc_swap(), temporarily hiding that swap. It's easy 2469 * and robust (though cpu-intensive) just to keep retrying. 2470 */ 2471 if (READ_ONCE(si->inuse_pages)) { 2472 if (!signal_pending(current)) 2473 goto retry; 2474 return -EINTR; 2475 } 2476 2477 success: 2478 /* 2479 * Make sure that further cleanups after try_to_unuse() returns happen 2480 * after swap_range_free() reduces si->inuse_pages to 0. 2481 */ 2482 smp_mb(); 2483 return 0; 2484 } 2485 2486 /* 2487 * After a successful try_to_unuse, if no swap is now in use, we know 2488 * we can empty the mmlist. swap_lock must be held on entry and exit. 2489 * Note that mmlist_lock nests inside swap_lock, and an mm must be 2490 * added to the mmlist just after page_duplicate - before would be racy. 2491 */ 2492 static void drain_mmlist(void) 2493 { 2494 struct list_head *p, *next; 2495 unsigned int type; 2496 2497 for (type = 0; type < nr_swapfiles; type++) 2498 if (swap_info[type]->inuse_pages) 2499 return; 2500 spin_lock(&mmlist_lock); 2501 list_for_each_safe(p, next, &init_mm.mmlist) 2502 list_del_init(p); 2503 spin_unlock(&mmlist_lock); 2504 } 2505 2506 /* 2507 * Free all of a swapdev's extent information 2508 */ 2509 static void destroy_swap_extents(struct swap_info_struct *sis) 2510 { 2511 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { 2512 struct rb_node *rb = sis->swap_extent_root.rb_node; 2513 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); 2514 2515 rb_erase(rb, &sis->swap_extent_root); 2516 kfree(se); 2517 } 2518 2519 if (sis->flags & SWP_ACTIVATED) { 2520 struct file *swap_file = sis->swap_file; 2521 struct address_space *mapping = swap_file->f_mapping; 2522 2523 sis->flags &= ~SWP_ACTIVATED; 2524 if (mapping->a_ops->swap_deactivate) 2525 mapping->a_ops->swap_deactivate(swap_file); 2526 } 2527 } 2528 2529 /* 2530 * Add a block range (and the corresponding page range) into this swapdev's 2531 * extent tree. 2532 * 2533 * This function rather assumes that it is called in ascending page order. 
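* Because of that ordering, a new range can only attach at the rightmost * node of the rbtree: it is merged into that extent when the disk blocks are * contiguous, otherwise a fresh extent is inserted there.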
2534 */ 2535 int 2536 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 2537 unsigned long nr_pages, sector_t start_block) 2538 { 2539 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; 2540 struct swap_extent *se; 2541 struct swap_extent *new_se; 2542 2543 /* 2544 * place the new node at the right most since the 2545 * function is called in ascending page order. 2546 */ 2547 while (*link) { 2548 parent = *link; 2549 link = &parent->rb_right; 2550 } 2551 2552 if (parent) { 2553 se = rb_entry(parent, struct swap_extent, rb_node); 2554 BUG_ON(se->start_page + se->nr_pages != start_page); 2555 if (se->start_block + se->nr_pages == start_block) { 2556 /* Merge it */ 2557 se->nr_pages += nr_pages; 2558 return 0; 2559 } 2560 } 2561 2562 /* No merge, insert a new extent. */ 2563 new_se = kmalloc(sizeof(*se), GFP_KERNEL); 2564 if (new_se == NULL) 2565 return -ENOMEM; 2566 new_se->start_page = start_page; 2567 new_se->nr_pages = nr_pages; 2568 new_se->start_block = start_block; 2569 2570 rb_link_node(&new_se->rb_node, parent, link); 2571 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); 2572 return 1; 2573 } 2574 EXPORT_SYMBOL_GPL(add_swap_extent); 2575 2576 /* 2577 * A `swap extent' is a simple thing which maps a contiguous range of pages 2578 * onto a contiguous range of disk blocks. A rbtree of swap extents is 2579 * built at swapon time and is then used at swap_writepage/swap_read_folio 2580 * time for locating where on disk a page belongs. 2581 * 2582 * If the swapfile is an S_ISBLK block device, a single extent is installed. 2583 * This is done so that the main operating code can treat S_ISBLK and S_ISREG 2584 * swap files identically. 2585 * 2586 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 2587 * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 2588 * swapfiles are handled *identically* after swapon time. 2589 * 2590 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 2591 * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray 2592 * blocks are found which do not fall within the PAGE_SIZE alignment 2593 * requirements, they are simply tossed out - we will never use those blocks 2594 * for swapping. 2595 * 2596 * For all swap devices we set S_SWAPFILE across the life of the swapon. This 2597 * prevents users from writing to the swap device, which will corrupt memory. 2598 * 2599 * The amount of disk space which a single swap extent represents varies. 2600 * Typically it is in the 1-4 megabyte range. So we can have hundreds of 2601 * extents in the rbtree. - akpm. 
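* * For example (illustrative numbers only): an extent with start_page == 16, * nr_pages == 1024 and start_block == 100000 maps swap offset 20 to disk * block 100000 + (20 - 16) == 100004, which is exactly the calculation done * by swapdev_block() above.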
2602 */ 2603 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 2604 { 2605 struct file *swap_file = sis->swap_file; 2606 struct address_space *mapping = swap_file->f_mapping; 2607 struct inode *inode = mapping->host; 2608 int ret; 2609 2610 if (S_ISBLK(inode->i_mode)) { 2611 ret = add_swap_extent(sis, 0, sis->max, 0); 2612 *span = sis->pages; 2613 return ret; 2614 } 2615 2616 if (mapping->a_ops->swap_activate) { 2617 ret = mapping->a_ops->swap_activate(sis, swap_file, span); 2618 if (ret < 0) 2619 return ret; 2620 sis->flags |= SWP_ACTIVATED; 2621 if ((sis->flags & SWP_FS_OPS) && 2622 sio_pool_init() != 0) { 2623 destroy_swap_extents(sis); 2624 return -ENOMEM; 2625 } 2626 return ret; 2627 } 2628 2629 return generic_swapfile_activate(sis, swap_file, span); 2630 } 2631 2632 static int swap_node(struct swap_info_struct *si) 2633 { 2634 struct block_device *bdev; 2635 2636 if (si->bdev) 2637 bdev = si->bdev; 2638 else 2639 bdev = si->swap_file->f_inode->i_sb->s_bdev; 2640 2641 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; 2642 } 2643 2644 static void setup_swap_info(struct swap_info_struct *si, int prio, 2645 unsigned char *swap_map, 2646 struct swap_cluster_info *cluster_info, 2647 unsigned long *zeromap) 2648 { 2649 int i; 2650 2651 if (prio >= 0) 2652 si->prio = prio; 2653 else 2654 si->prio = --least_priority; 2655 /* 2656 * the plist prio is negated because plist ordering is 2657 * low-to-high, while swap ordering is high-to-low 2658 */ 2659 si->list.prio = -si->prio; 2660 for_each_node(i) { 2661 if (si->prio >= 0) 2662 si->avail_lists[i].prio = -si->prio; 2663 else { 2664 if (swap_node(si) == i) 2665 si->avail_lists[i].prio = 1; 2666 else 2667 si->avail_lists[i].prio = -si->prio; 2668 } 2669 } 2670 si->swap_map = swap_map; 2671 si->cluster_info = cluster_info; 2672 si->zeromap = zeromap; 2673 } 2674 2675 static void _enable_swap_info(struct swap_info_struct *si) 2676 { 2677 si->flags |= SWP_WRITEOK; 2678 atomic_long_add(si->pages, &nr_swap_pages); 2679 total_swap_pages += si->pages; 2680 2681 assert_spin_locked(&swap_lock); 2682 /* 2683 * both lists are plists, and thus priority ordered. 2684 * swap_active_head needs to be priority ordered for swapoff(), 2685 * which on removal of any swap_info_struct with an auto-assigned 2686 * (i.e. negative) priority increments the auto-assigned priority 2687 * of any lower-priority swap_info_structs. 2688 * swap_avail_head needs to be priority ordered for folio_alloc_swap(), 2689 * which allocates swap pages from the highest available priority 2690 * swap_info_struct. 2691 */ 2692 plist_add(&si->list, &swap_active_head); 2693 2694 /* add to available list iff swap device is not full */ 2695 if (si->highest_bit) 2696 add_to_avail_list(si); 2697 } 2698 2699 static void enable_swap_info(struct swap_info_struct *si, int prio, 2700 unsigned char *swap_map, 2701 struct swap_cluster_info *cluster_info, 2702 unsigned long *zeromap) 2703 { 2704 spin_lock(&swap_lock); 2705 spin_lock(&si->lock); 2706 setup_swap_info(si, prio, swap_map, cluster_info, zeromap); 2707 spin_unlock(&si->lock); 2708 spin_unlock(&swap_lock); 2709 /* 2710 * Finished initializing swap device, now it's safe to reference it. 
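* percpu_ref_resurrect() below re-enables si->users, which alloc_swap_info() * created dead (PERCPU_REF_INIT_DEAD); it is the counterpart of the * percpu_ref_kill() issued by swapoff.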
2711 */ 2712 percpu_ref_resurrect(&si->users); 2713 spin_lock(&swap_lock); 2714 spin_lock(&si->lock); 2715 _enable_swap_info(si); 2716 spin_unlock(&si->lock); 2717 spin_unlock(&swap_lock); 2718 } 2719 2720 static void reinsert_swap_info(struct swap_info_struct *si) 2721 { 2722 spin_lock(&swap_lock); 2723 spin_lock(&si->lock); 2724 setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); 2725 _enable_swap_info(si); 2726 spin_unlock(&si->lock); 2727 spin_unlock(&swap_lock); 2728 } 2729 2730 static bool __has_usable_swap(void) 2731 { 2732 return !plist_head_empty(&swap_active_head); 2733 } 2734 2735 bool has_usable_swap(void) 2736 { 2737 bool ret; 2738 2739 spin_lock(&swap_lock); 2740 ret = __has_usable_swap(); 2741 spin_unlock(&swap_lock); 2742 return ret; 2743 } 2744 2745 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2746 { 2747 struct swap_info_struct *p = NULL; 2748 unsigned char *swap_map; 2749 unsigned long *zeromap; 2750 struct swap_cluster_info *cluster_info; 2751 struct file *swap_file, *victim; 2752 struct address_space *mapping; 2753 struct inode *inode; 2754 struct filename *pathname; 2755 int err, found = 0; 2756 2757 if (!capable(CAP_SYS_ADMIN)) 2758 return -EPERM; 2759 2760 BUG_ON(!current->mm); 2761 2762 pathname = getname(specialfile); 2763 if (IS_ERR(pathname)) 2764 return PTR_ERR(pathname); 2765 2766 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 2767 err = PTR_ERR(victim); 2768 if (IS_ERR(victim)) 2769 goto out; 2770 2771 mapping = victim->f_mapping; 2772 spin_lock(&swap_lock); 2773 plist_for_each_entry(p, &swap_active_head, list) { 2774 if (p->flags & SWP_WRITEOK) { 2775 if (p->swap_file->f_mapping == mapping) { 2776 found = 1; 2777 break; 2778 } 2779 } 2780 } 2781 if (!found) { 2782 err = -EINVAL; 2783 spin_unlock(&swap_lock); 2784 goto out_dput; 2785 } 2786 if (!security_vm_enough_memory_mm(current->mm, p->pages)) 2787 vm_unacct_memory(p->pages); 2788 else { 2789 err = -ENOMEM; 2790 spin_unlock(&swap_lock); 2791 goto out_dput; 2792 } 2793 spin_lock(&p->lock); 2794 del_from_avail_list(p); 2795 if (p->prio < 0) { 2796 struct swap_info_struct *si = p; 2797 int nid; 2798 2799 plist_for_each_entry_continue(si, &swap_active_head, list) { 2800 si->prio++; 2801 si->list.prio--; 2802 for_each_node(nid) { 2803 if (si->avail_lists[nid].prio != 1) 2804 si->avail_lists[nid].prio--; 2805 } 2806 } 2807 least_priority++; 2808 } 2809 plist_del(&p->list, &swap_active_head); 2810 atomic_long_sub(p->pages, &nr_swap_pages); 2811 total_swap_pages -= p->pages; 2812 p->flags &= ~SWP_WRITEOK; 2813 spin_unlock(&p->lock); 2814 spin_unlock(&swap_lock); 2815 2816 disable_swap_slots_cache_lock(); 2817 2818 set_current_oom_origin(); 2819 err = try_to_unuse(p->type); 2820 clear_current_oom_origin(); 2821 2822 if (err) { 2823 /* re-insert swap space back into swap_list */ 2824 reinsert_swap_info(p); 2825 reenable_swap_slots_cache_unlock(); 2826 goto out_dput; 2827 } 2828 2829 reenable_swap_slots_cache_unlock(); 2830 2831 /* 2832 * Wait for swap operations protected by get/put_swap_device() 2833 * to complete. Because of synchronize_rcu() here, all swap 2834 * operations protected by RCU reader side lock (including any 2835 * spinlock) will be waited too. This makes it easy to 2836 * prevent folio_test_swapcache() and the following swap cache 2837 * operations from racing with swapoff. 
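* Once percpu_ref_kill() below has run, get_swap_device() can no longer take * a new reference on this device, and wait_for_completion(&p->comp) waits * for the existing users to drain.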
2838 */ 2839 percpu_ref_kill(&p->users); 2840 synchronize_rcu(); 2841 wait_for_completion(&p->comp); 2842 2843 flush_work(&p->discard_work); 2844 flush_work(&p->reclaim_work); 2845 2846 destroy_swap_extents(p); 2847 if (p->flags & SWP_CONTINUED) 2848 free_swap_count_continuations(p); 2849 2850 if (!p->bdev || !bdev_nonrot(p->bdev)) 2851 atomic_dec(&nr_rotate_swap); 2852 2853 mutex_lock(&swapon_mutex); 2854 spin_lock(&swap_lock); 2855 spin_lock(&p->lock); 2856 drain_mmlist(); 2857 2858 /* wait for anyone still in scan_swap_map_slots */ 2859 p->highest_bit = 0; /* cuts scans short */ 2860 while (p->flags >= SWP_SCANNING) { 2861 spin_unlock(&p->lock); 2862 spin_unlock(&swap_lock); 2863 schedule_timeout_uninterruptible(1); 2864 spin_lock(&swap_lock); 2865 spin_lock(&p->lock); 2866 } 2867 2868 swap_file = p->swap_file; 2869 p->swap_file = NULL; 2870 p->max = 0; 2871 swap_map = p->swap_map; 2872 p->swap_map = NULL; 2873 zeromap = p->zeromap; 2874 p->zeromap = NULL; 2875 cluster_info = p->cluster_info; 2876 p->cluster_info = NULL; 2877 spin_unlock(&p->lock); 2878 spin_unlock(&swap_lock); 2879 arch_swap_invalidate_area(p->type); 2880 zswap_swapoff(p->type); 2881 mutex_unlock(&swapon_mutex); 2882 free_percpu(p->percpu_cluster); 2883 p->percpu_cluster = NULL; 2884 free_percpu(p->cluster_next_cpu); 2885 p->cluster_next_cpu = NULL; 2886 vfree(swap_map); 2887 kvfree(zeromap); 2888 kvfree(cluster_info); 2889 /* Destroy swap account information */ 2890 swap_cgroup_swapoff(p->type); 2891 exit_swap_address_space(p->type); 2892 2893 inode = mapping->host; 2894 2895 inode_lock(inode); 2896 inode->i_flags &= ~S_SWAPFILE; 2897 inode_unlock(inode); 2898 filp_close(swap_file, NULL); 2899 2900 /* 2901 * Clear the SWP_USED flag after all resources are freed so that swapon 2902 * can reuse this swap_info in alloc_swap_info() safely. It is ok to 2903 * not hold p->lock after we cleared its SWP_WRITEOK. 
2904 */ 2905 spin_lock(&swap_lock); 2906 p->flags = 0; 2907 spin_unlock(&swap_lock); 2908 2909 err = 0; 2910 atomic_inc(&proc_poll_event); 2911 wake_up_interruptible(&proc_poll_wait); 2912 2913 out_dput: 2914 filp_close(victim, NULL); 2915 out: 2916 putname(pathname); 2917 return err; 2918 } 2919 2920 #ifdef CONFIG_PROC_FS 2921 static __poll_t swaps_poll(struct file *file, poll_table *wait) 2922 { 2923 struct seq_file *seq = file->private_data; 2924 2925 poll_wait(file, &proc_poll_wait, wait); 2926 2927 if (seq->poll_event != atomic_read(&proc_poll_event)) { 2928 seq->poll_event = atomic_read(&proc_poll_event); 2929 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; 2930 } 2931 2932 return EPOLLIN | EPOLLRDNORM; 2933 } 2934 2935 /* iterator */ 2936 static void *swap_start(struct seq_file *swap, loff_t *pos) 2937 { 2938 struct swap_info_struct *si; 2939 int type; 2940 loff_t l = *pos; 2941 2942 mutex_lock(&swapon_mutex); 2943 2944 if (!l) 2945 return SEQ_START_TOKEN; 2946 2947 for (type = 0; (si = swap_type_to_swap_info(type)); type++) { 2948 if (!(si->flags & SWP_USED) || !si->swap_map) 2949 continue; 2950 if (!--l) 2951 return si; 2952 } 2953 2954 return NULL; 2955 } 2956 2957 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 2958 { 2959 struct swap_info_struct *si = v; 2960 int type; 2961 2962 if (v == SEQ_START_TOKEN) 2963 type = 0; 2964 else 2965 type = si->type + 1; 2966 2967 ++(*pos); 2968 for (; (si = swap_type_to_swap_info(type)); type++) { 2969 if (!(si->flags & SWP_USED) || !si->swap_map) 2970 continue; 2971 return si; 2972 } 2973 2974 return NULL; 2975 } 2976 2977 static void swap_stop(struct seq_file *swap, void *v) 2978 { 2979 mutex_unlock(&swapon_mutex); 2980 } 2981 2982 static int swap_show(struct seq_file *swap, void *v) 2983 { 2984 struct swap_info_struct *si = v; 2985 struct file *file; 2986 int len; 2987 unsigned long bytes, inuse; 2988 2989 if (si == SEQ_START_TOKEN) { 2990 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); 2991 return 0; 2992 } 2993 2994 bytes = K(si->pages); 2995 inuse = K(READ_ONCE(si->inuse_pages)); 2996 2997 file = si->swap_file; 2998 len = seq_file_path(swap, file, " \t\n\\"); 2999 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", 3000 len < 40 ? 40 - len : 1, " ", 3001 S_ISBLK(file_inode(file)->i_mode) ? 3002 "partition" : "file\t", 3003 bytes, bytes < 10000000 ? "\t" : "", 3004 inuse, inuse < 10000000 ? 
"\t" : "", 3005 si->prio); 3006 return 0; 3007 } 3008 3009 static const struct seq_operations swaps_op = { 3010 .start = swap_start, 3011 .next = swap_next, 3012 .stop = swap_stop, 3013 .show = swap_show 3014 }; 3015 3016 static int swaps_open(struct inode *inode, struct file *file) 3017 { 3018 struct seq_file *seq; 3019 int ret; 3020 3021 ret = seq_open(file, &swaps_op); 3022 if (ret) 3023 return ret; 3024 3025 seq = file->private_data; 3026 seq->poll_event = atomic_read(&proc_poll_event); 3027 return 0; 3028 } 3029 3030 static const struct proc_ops swaps_proc_ops = { 3031 .proc_flags = PROC_ENTRY_PERMANENT, 3032 .proc_open = swaps_open, 3033 .proc_read = seq_read, 3034 .proc_lseek = seq_lseek, 3035 .proc_release = seq_release, 3036 .proc_poll = swaps_poll, 3037 }; 3038 3039 static int __init procswaps_init(void) 3040 { 3041 proc_create("swaps", 0, NULL, &swaps_proc_ops); 3042 return 0; 3043 } 3044 __initcall(procswaps_init); 3045 #endif /* CONFIG_PROC_FS */ 3046 3047 #ifdef MAX_SWAPFILES_CHECK 3048 static int __init max_swapfiles_check(void) 3049 { 3050 MAX_SWAPFILES_CHECK(); 3051 return 0; 3052 } 3053 late_initcall(max_swapfiles_check); 3054 #endif 3055 3056 static struct swap_info_struct *alloc_swap_info(void) 3057 { 3058 struct swap_info_struct *p; 3059 struct swap_info_struct *defer = NULL; 3060 unsigned int type; 3061 int i; 3062 3063 p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); 3064 if (!p) 3065 return ERR_PTR(-ENOMEM); 3066 3067 if (percpu_ref_init(&p->users, swap_users_ref_free, 3068 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { 3069 kvfree(p); 3070 return ERR_PTR(-ENOMEM); 3071 } 3072 3073 spin_lock(&swap_lock); 3074 for (type = 0; type < nr_swapfiles; type++) { 3075 if (!(swap_info[type]->flags & SWP_USED)) 3076 break; 3077 } 3078 if (type >= MAX_SWAPFILES) { 3079 spin_unlock(&swap_lock); 3080 percpu_ref_exit(&p->users); 3081 kvfree(p); 3082 return ERR_PTR(-EPERM); 3083 } 3084 if (type >= nr_swapfiles) { 3085 p->type = type; 3086 /* 3087 * Publish the swap_info_struct after initializing it. 3088 * Note that kvzalloc() above zeroes all its fields. 3089 */ 3090 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ 3091 nr_swapfiles++; 3092 } else { 3093 defer = p; 3094 p = swap_info[type]; 3095 /* 3096 * Do not memset this entry: a racing procfs swap_next() 3097 * would be relying on p->type to remain valid. 3098 */ 3099 } 3100 p->swap_extent_root = RB_ROOT; 3101 plist_node_init(&p->list, 0); 3102 for_each_node(i) 3103 plist_node_init(&p->avail_lists[i], 0); 3104 p->flags = SWP_USED; 3105 spin_unlock(&swap_lock); 3106 if (defer) { 3107 percpu_ref_exit(&defer->users); 3108 kvfree(defer); 3109 } 3110 spin_lock_init(&p->lock); 3111 spin_lock_init(&p->cont_lock); 3112 init_completion(&p->comp); 3113 3114 return p; 3115 } 3116 3117 static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) 3118 { 3119 if (S_ISBLK(inode->i_mode)) { 3120 si->bdev = I_BDEV(inode); 3121 /* 3122 * Zoned block devices contain zones that have a sequential 3123 * write only restriction. Hence zoned block devices are not 3124 * suitable for swapping. Disallow them here. 3125 */ 3126 if (bdev_is_zoned(si->bdev)) 3127 return -EINVAL; 3128 si->flags |= SWP_BLKDEV; 3129 } else if (S_ISREG(inode->i_mode)) { 3130 si->bdev = inode->i_sb->s_bdev; 3131 } 3132 3133 return 0; 3134 } 3135 3136 3137 /* 3138 * Find out how many pages are allowed for a single swap device. 
There 3139 * are two limiting factors: 3140 * 1) the number of bits for the swap offset in the swp_entry_t type, and 3141 * 2) the number of bits in the swap pte, as defined by the different 3142 * architectures. 3143 * 3144 * In order to find the largest possible bit mask, a swap entry with 3145 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, 3146 * decoded to a swp_entry_t again, and finally the swap offset is 3147 * extracted. 3148 * 3149 * This will mask all the bits from the initial ~0UL mask that can't 3150 * be encoded in either the swp_entry_t or the architecture definition 3151 * of a swap pte. 3152 */ 3153 unsigned long generic_max_swapfile_size(void) 3154 { 3155 return swp_offset(pte_to_swp_entry( 3156 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 3157 } 3158 3159 /* Can be overridden by an architecture for additional checks. */ 3160 __weak unsigned long arch_max_swapfile_size(void) 3161 { 3162 return generic_max_swapfile_size(); 3163 } 3164 3165 static unsigned long read_swap_header(struct swap_info_struct *si, 3166 union swap_header *swap_header, 3167 struct inode *inode) 3168 { 3169 int i; 3170 unsigned long maxpages; 3171 unsigned long swapfilepages; 3172 unsigned long last_page; 3173 3174 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 3175 pr_err("Unable to find swap-space signature\n"); 3176 return 0; 3177 } 3178 3179 /* swap partition endianness hack... */ 3180 if (swab32(swap_header->info.version) == 1) { 3181 swab32s(&swap_header->info.version); 3182 swab32s(&swap_header->info.last_page); 3183 swab32s(&swap_header->info.nr_badpages); 3184 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3185 return 0; 3186 for (i = 0; i < swap_header->info.nr_badpages; i++) 3187 swab32s(&swap_header->info.badpages[i]); 3188 } 3189 /* Check the swap header's sub-version */ 3190 if (swap_header->info.version != 1) { 3191 pr_warn("Unable to handle swap header version %d\n", 3192 swap_header->info.version); 3193 return 0; 3194 } 3195 3196 si->lowest_bit = 1; 3197 si->cluster_next = 1; 3198 si->cluster_nr = 0; 3199 3200 maxpages = swapfile_maximum_size; 3201 last_page = swap_header->info.last_page; 3202 if (!last_page) { 3203 pr_warn("Empty swap-file\n"); 3204 return 0; 3205 } 3206 if (last_page > maxpages) { 3207 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", 3208 K(maxpages), K(last_page)); 3209 } 3210 if (maxpages > last_page) { 3211 maxpages = last_page + 1; 3212 /* p->max is an unsigned int: don't overflow it */ 3213 if ((unsigned int)maxpages == 0) 3214 maxpages = UINT_MAX; 3215 } 3216 si->highest_bit = maxpages - 1; 3217 3218 if (!maxpages) 3219 return 0; 3220 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 3221 if (swapfilepages && maxpages > swapfilepages) { 3222 pr_warn("Swap area shorter than signature indicates\n"); 3223 return 0; 3224 } 3225 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 3226 return 0; 3227 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 3228 return 0; 3229 3230 return maxpages; 3231 } 3232 3233 #define SWAP_CLUSTER_INFO_COLS \ 3234 DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) 3235 #define SWAP_CLUSTER_SPACE_COLS \ 3236 DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) 3237 #define SWAP_CLUSTER_COLS \ 3238 max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) 3239 3240 static int setup_swap_map_and_extents(struct swap_info_struct *si, 3241 union swap_header *swap_header, 3242 unsigned char *swap_map, 3243 unsigned long maxpages, 3244 
sector_t *span) 3245 { 3246 unsigned int nr_good_pages; 3247 unsigned long i; 3248 int nr_extents; 3249 3250 nr_good_pages = maxpages - 1; /* omit header page */ 3251 3252 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3253 unsigned int page_nr = swap_header->info.badpages[i]; 3254 if (page_nr == 0 || page_nr > swap_header->info.last_page) 3255 return -EINVAL; 3256 if (page_nr < maxpages) { 3257 swap_map[page_nr] = SWAP_MAP_BAD; 3258 nr_good_pages--; 3259 } 3260 } 3261 3262 if (nr_good_pages) { 3263 swap_map[0] = SWAP_MAP_BAD; 3264 si->max = maxpages; 3265 si->pages = nr_good_pages; 3266 nr_extents = setup_swap_extents(si, span); 3267 if (nr_extents < 0) 3268 return nr_extents; 3269 nr_good_pages = si->pages; 3270 } 3271 if (!nr_good_pages) { 3272 pr_warn("Empty swap-file\n"); 3273 return -EINVAL; 3274 } 3275 3276 return nr_extents; 3277 } 3278 3279 static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, 3280 union swap_header *swap_header, 3281 unsigned long maxpages) 3282 { 3283 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 3284 unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; 3285 struct swap_cluster_info *cluster_info; 3286 unsigned long i, j, k, idx; 3287 int cpu, err = -ENOMEM; 3288 3289 cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); 3290 if (!cluster_info) 3291 goto err; 3292 3293 for (i = 0; i < nr_clusters; i++) 3294 spin_lock_init(&cluster_info[i].lock); 3295 3296 si->cluster_next_cpu = alloc_percpu(unsigned int); 3297 if (!si->cluster_next_cpu) 3298 goto err_free; 3299 3300 /* Random start position to help with wear leveling */ 3301 for_each_possible_cpu(cpu) 3302 per_cpu(*si->cluster_next_cpu, cpu) = 3303 get_random_u32_inclusive(1, si->highest_bit); 3304 3305 si->percpu_cluster = alloc_percpu(struct percpu_cluster); 3306 if (!si->percpu_cluster) 3307 goto err_free; 3308 3309 for_each_possible_cpu(cpu) { 3310 struct percpu_cluster *cluster; 3311 3312 cluster = per_cpu_ptr(si->percpu_cluster, cpu); 3313 for (i = 0; i < SWAP_NR_ORDERS; i++) 3314 cluster->next[i] = SWAP_NEXT_INVALID; 3315 } 3316 3317 /* 3318 * Mark unusable pages as unavailable. The clusters aren't 3319 * marked free yet, so no list operations are involved yet. 3320 * 3321 * See setup_swap_map_and_extents(): header page, bad pages, 3322 * and the EOF part of the last cluster. 3323 */ 3324 inc_cluster_info_page(si, cluster_info, 0); 3325 for (i = 0; i < swap_header->info.nr_badpages; i++) 3326 inc_cluster_info_page(si, cluster_info, 3327 swap_header->info.badpages[i]); 3328 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) 3329 inc_cluster_info_page(si, cluster_info, i); 3330 3331 INIT_LIST_HEAD(&si->free_clusters); 3332 INIT_LIST_HEAD(&si->full_clusters); 3333 INIT_LIST_HEAD(&si->discard_clusters); 3334 3335 for (i = 0; i < SWAP_NR_ORDERS; i++) { 3336 INIT_LIST_HEAD(&si->nonfull_clusters[i]); 3337 INIT_LIST_HEAD(&si->frag_clusters[i]); 3338 si->frag_cluster_nr[i] = 0; 3339 } 3340 3341 /* 3342 * Reduce false cache line sharing between cluster_info and 3343 * sharing same address space. 
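* (The loops below link the initial free/nonfull lists column by column, * SWAP_CLUSTER_COLS entries apart, so clusters that are adjacent on a list * live in different cache lines of this array.)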
3344 */ 3345 for (k = 0; k < SWAP_CLUSTER_COLS; k++) { 3346 j = (k + col) % SWAP_CLUSTER_COLS; 3347 for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { 3348 struct swap_cluster_info *ci; 3349 idx = i * SWAP_CLUSTER_COLS + j; 3350 ci = cluster_info + idx; 3351 if (idx >= nr_clusters) 3352 continue; 3353 if (ci->count) { 3354 ci->flags = CLUSTER_FLAG_NONFULL; 3355 list_add_tail(&ci->list, &si->nonfull_clusters[0]); 3356 continue; 3357 } 3358 ci->flags = CLUSTER_FLAG_FREE; 3359 list_add_tail(&ci->list, &si->free_clusters); 3360 } 3361 } 3362 3363 return cluster_info; 3364 3365 err_free: 3366 kvfree(cluster_info); 3367 err: 3368 return ERR_PTR(err); 3369 } 3370 3371 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 3372 { 3373 struct swap_info_struct *si; 3374 struct filename *name; 3375 struct file *swap_file = NULL; 3376 struct address_space *mapping; 3377 struct dentry *dentry; 3378 int prio; 3379 int error; 3380 union swap_header *swap_header; 3381 int nr_extents; 3382 sector_t span; 3383 unsigned long maxpages; 3384 unsigned char *swap_map = NULL; 3385 unsigned long *zeromap = NULL; 3386 struct swap_cluster_info *cluster_info = NULL; 3387 struct folio *folio = NULL; 3388 struct inode *inode = NULL; 3389 bool inced_nr_rotate_swap = false; 3390 3391 if (swap_flags & ~SWAP_FLAGS_VALID) 3392 return -EINVAL; 3393 3394 if (!capable(CAP_SYS_ADMIN)) 3395 return -EPERM; 3396 3397 if (!swap_avail_heads) 3398 return -ENOMEM; 3399 3400 si = alloc_swap_info(); 3401 if (IS_ERR(si)) 3402 return PTR_ERR(si); 3403 3404 INIT_WORK(&si->discard_work, swap_discard_work); 3405 INIT_WORK(&si->reclaim_work, swap_reclaim_work); 3406 3407 name = getname(specialfile); 3408 if (IS_ERR(name)) { 3409 error = PTR_ERR(name); 3410 name = NULL; 3411 goto bad_swap; 3412 } 3413 swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); 3414 if (IS_ERR(swap_file)) { 3415 error = PTR_ERR(swap_file); 3416 swap_file = NULL; 3417 goto bad_swap; 3418 } 3419 3420 si->swap_file = swap_file; 3421 mapping = swap_file->f_mapping; 3422 dentry = swap_file->f_path.dentry; 3423 inode = mapping->host; 3424 3425 error = claim_swapfile(si, inode); 3426 if (unlikely(error)) 3427 goto bad_swap; 3428 3429 inode_lock(inode); 3430 if (d_unlinked(dentry) || cant_mount(dentry)) { 3431 error = -ENOENT; 3432 goto bad_swap_unlock_inode; 3433 } 3434 if (IS_SWAPFILE(inode)) { 3435 error = -EBUSY; 3436 goto bad_swap_unlock_inode; 3437 } 3438 3439 /* 3440 * Read the swap header. 
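* The header occupies the first page of the swap file, so the filesystem * must provide ->read_folio for read_mapping_folio() below.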
3441 */ 3442 if (!mapping->a_ops->read_folio) { 3443 error = -EINVAL; 3444 goto bad_swap_unlock_inode; 3445 } 3446 folio = read_mapping_folio(mapping, 0, swap_file); 3447 if (IS_ERR(folio)) { 3448 error = PTR_ERR(folio); 3449 goto bad_swap_unlock_inode; 3450 } 3451 swap_header = kmap_local_folio(folio, 0); 3452 3453 maxpages = read_swap_header(si, swap_header, inode); 3454 if (unlikely(!maxpages)) { 3455 error = -EINVAL; 3456 goto bad_swap_unlock_inode; 3457 } 3458 3459 /* OK, set up the swap map and apply the bad block list */ 3460 swap_map = vzalloc(maxpages); 3461 if (!swap_map) { 3462 error = -ENOMEM; 3463 goto bad_swap_unlock_inode; 3464 } 3465 3466 error = swap_cgroup_swapon(si->type, maxpages); 3467 if (error) 3468 goto bad_swap_unlock_inode; 3469 3470 nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, 3471 maxpages, &span); 3472 if (unlikely(nr_extents < 0)) { 3473 error = nr_extents; 3474 goto bad_swap_unlock_inode; 3475 } 3476 3477 /* 3478 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might 3479 * be above MAX_PAGE_ORDER incase of a large swap file. 3480 */ 3481 zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), 3482 GFP_KERNEL | __GFP_ZERO); 3483 if (!zeromap) { 3484 error = -ENOMEM; 3485 goto bad_swap_unlock_inode; 3486 } 3487 3488 if (si->bdev && bdev_stable_writes(si->bdev)) 3489 si->flags |= SWP_STABLE_WRITES; 3490 3491 if (si->bdev && bdev_synchronous(si->bdev)) 3492 si->flags |= SWP_SYNCHRONOUS_IO; 3493 3494 if (si->bdev && bdev_nonrot(si->bdev)) { 3495 si->flags |= SWP_SOLIDSTATE; 3496 3497 cluster_info = setup_clusters(si, swap_header, maxpages); 3498 if (IS_ERR(cluster_info)) { 3499 error = PTR_ERR(cluster_info); 3500 cluster_info = NULL; 3501 goto bad_swap_unlock_inode; 3502 } 3503 } else { 3504 atomic_inc(&nr_rotate_swap); 3505 inced_nr_rotate_swap = true; 3506 } 3507 3508 if ((swap_flags & SWAP_FLAG_DISCARD) && 3509 si->bdev && bdev_max_discard_sectors(si->bdev)) { 3510 /* 3511 * When discard is enabled for swap with no particular 3512 * policy flagged, we set all swap discard flags here in 3513 * order to sustain backward compatibility with older 3514 * swapon(8) releases. 3515 */ 3516 si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | 3517 SWP_PAGE_DISCARD); 3518 3519 /* 3520 * By flagging sys_swapon, a sysadmin can tell us to 3521 * either do single-time area discards only, or to just 3522 * perform discards for released swap page-clusters. 3523 * Now it's time to adjust the p->flags accordingly. 3524 */ 3525 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 3526 si->flags &= ~SWP_PAGE_DISCARD; 3527 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 3528 si->flags &= ~SWP_AREA_DISCARD; 3529 3530 /* issue a swapon-time discard if it's still required */ 3531 if (si->flags & SWP_AREA_DISCARD) { 3532 int err = discard_swap(si); 3533 if (unlikely(err)) 3534 pr_err("swapon: discard_swap(%p): %d\n", 3535 si, err); 3536 } 3537 } 3538 3539 error = init_swap_address_space(si->type, maxpages); 3540 if (error) 3541 goto bad_swap_unlock_inode; 3542 3543 error = zswap_swapon(si->type, maxpages); 3544 if (error) 3545 goto free_swap_address_space; 3546 3547 /* 3548 * Flush any pending IO and dirty mappings before we start using this 3549 * swap device. 
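* S_SWAPFILE is set first so that new writes to the file are refused, then * inode_drain_writes() waits for writes already in flight.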
3550 */ 3551 inode->i_flags |= S_SWAPFILE; 3552 error = inode_drain_writes(inode); 3553 if (error) { 3554 inode->i_flags &= ~S_SWAPFILE; 3555 goto free_swap_zswap; 3556 } 3557 3558 mutex_lock(&swapon_mutex); 3559 prio = -1; 3560 if (swap_flags & SWAP_FLAG_PREFER) 3561 prio = 3562 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 3563 enable_swap_info(si, prio, swap_map, cluster_info, zeromap); 3564 3565 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", 3566 K(si->pages), name->name, si->prio, nr_extents, 3567 K((unsigned long long)span), 3568 (si->flags & SWP_SOLIDSTATE) ? "SS" : "", 3569 (si->flags & SWP_DISCARDABLE) ? "D" : "", 3570 (si->flags & SWP_AREA_DISCARD) ? "s" : "", 3571 (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); 3572 3573 mutex_unlock(&swapon_mutex); 3574 atomic_inc(&proc_poll_event); 3575 wake_up_interruptible(&proc_poll_wait); 3576 3577 error = 0; 3578 goto out; 3579 free_swap_zswap: 3580 zswap_swapoff(si->type); 3581 free_swap_address_space: 3582 exit_swap_address_space(si->type); 3583 bad_swap_unlock_inode: 3584 inode_unlock(inode); 3585 bad_swap: 3586 free_percpu(si->percpu_cluster); 3587 si->percpu_cluster = NULL; 3588 free_percpu(si->cluster_next_cpu); 3589 si->cluster_next_cpu = NULL; 3590 inode = NULL; 3591 destroy_swap_extents(si); 3592 swap_cgroup_swapoff(si->type); 3593 spin_lock(&swap_lock); 3594 si->swap_file = NULL; 3595 si->flags = 0; 3596 spin_unlock(&swap_lock); 3597 vfree(swap_map); 3598 kvfree(zeromap); 3599 kvfree(cluster_info); 3600 if (inced_nr_rotate_swap) 3601 atomic_dec(&nr_rotate_swap); 3602 if (swap_file) 3603 filp_close(swap_file, NULL); 3604 out: 3605 if (!IS_ERR_OR_NULL(folio)) 3606 folio_release_kmap(folio, swap_header); 3607 if (name) 3608 putname(name); 3609 if (inode) 3610 inode_unlock(inode); 3611 if (!error) 3612 enable_swap_slots_cache(); 3613 return error; 3614 } 3615 3616 void si_swapinfo(struct sysinfo *val) 3617 { 3618 unsigned int type; 3619 unsigned long nr_to_be_unused = 0; 3620 3621 spin_lock(&swap_lock); 3622 for (type = 0; type < nr_swapfiles; type++) { 3623 struct swap_info_struct *si = swap_info[type]; 3624 3625 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 3626 nr_to_be_unused += READ_ONCE(si->inuse_pages); 3627 } 3628 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; 3629 val->totalswap = total_swap_pages + nr_to_be_unused; 3630 spin_unlock(&swap_lock); 3631 } 3632 3633 /* 3634 * Verify that nr swap entries are valid and increment their swap map counts. 3635 * 3636 * Returns error code in following case. 3637 * - success -> 0 3638 * - swp_entry is invalid -> EINVAL 3639 * - swp_entry is migration entry -> EINVAL 3640 * - swap-cache reference is requested but there is already one. -> EEXIST 3641 * - swap-cache reference is requested but the entry is not used. -> ENOENT 3642 * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM 3643 */ 3644 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) 3645 { 3646 struct swap_info_struct *si; 3647 struct swap_cluster_info *ci; 3648 unsigned long offset; 3649 unsigned char count; 3650 unsigned char has_cache; 3651 int err, i; 3652 3653 si = swp_swap_info(entry); 3654 3655 offset = swp_offset(entry); 3656 VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); 3657 VM_WARN_ON(usage == 1 && nr > 1); 3658 ci = lock_cluster_or_swap_info(si, offset); 3659 3660 err = 0; 3661 for (i = 0; i < nr; i++) { 3662 count = si->swap_map[offset + i]; 3663 3664 /* 3665 * swapin_readahead() doesn't check if a swap entry is valid, so the 3666 * swap entry could be SWAP_MAP_BAD. Check here with lock held. 3667 */ 3668 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { 3669 err = -ENOENT; 3670 goto unlock_out; 3671 } 3672 3673 has_cache = count & SWAP_HAS_CACHE; 3674 count &= ~SWAP_HAS_CACHE; 3675 3676 if (!count && !has_cache) { 3677 err = -ENOENT; 3678 } else if (usage == SWAP_HAS_CACHE) { 3679 if (has_cache) 3680 err = -EEXIST; 3681 } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { 3682 err = -EINVAL; 3683 } 3684 3685 if (err) 3686 goto unlock_out; 3687 } 3688 3689 for (i = 0; i < nr; i++) { 3690 count = si->swap_map[offset + i]; 3691 has_cache = count & SWAP_HAS_CACHE; 3692 count &= ~SWAP_HAS_CACHE; 3693 3694 if (usage == SWAP_HAS_CACHE) 3695 has_cache = SWAP_HAS_CACHE; 3696 else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3697 count += usage; 3698 else if (swap_count_continued(si, offset + i, count)) 3699 count = COUNT_CONTINUED; 3700 else { 3701 /* 3702 * Don't need to rollback changes, because if 3703 * usage == 1, there must be nr == 1. 3704 */ 3705 err = -ENOMEM; 3706 goto unlock_out; 3707 } 3708 3709 WRITE_ONCE(si->swap_map[offset + i], count | has_cache); 3710 } 3711 3712 unlock_out: 3713 unlock_cluster_or_swap_info(si, ci); 3714 return err; 3715 } 3716 3717 /* 3718 * Help swapoff by noting that swap entry belongs to shmem/tmpfs 3719 * (in which case its reference count is never incremented). 3720 */ 3721 void swap_shmem_alloc(swp_entry_t entry, int nr) 3722 { 3723 __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); 3724 } 3725 3726 /* 3727 * Increase reference count of swap entry by 1. 3728 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3729 * but could not be atomically allocated. Returns 0, just as if it succeeded, 3730 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3731 * might occur if a page table entry has got corrupted. 3732 */ 3733 int swap_duplicate(swp_entry_t entry) 3734 { 3735 int err = 0; 3736 3737 while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) 3738 err = add_swap_count_continuation(entry, GFP_ATOMIC); 3739 return err; 3740 } 3741 3742 /* 3743 * @entry: first swap entry from which we allocate nr swap cache. 3744 * 3745 * Called when allocating swap cache for existing swap entries, 3746 * This can return error codes. Returns 0 at success. 3747 * -EEXIST means there is a swap cache. 3748 * Note: return code is different from swap_duplicate(). 
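* -EEXIST usually means another task set SWAP_HAS_CACHE first; callers are * expected to back off or retry rather than treat it as a hard error.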
3749 */ 3750 int swapcache_prepare(swp_entry_t entry, int nr) 3751 { 3752 return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); 3753 } 3754 3755 void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) 3756 { 3757 unsigned long offset = swp_offset(entry); 3758 3759 cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); 3760 } 3761 3762 struct swap_info_struct *swp_swap_info(swp_entry_t entry) 3763 { 3764 return swap_type_to_swap_info(swp_type(entry)); 3765 } 3766 3767 /* 3768 * out-of-line methods to avoid include hell. 3769 */ 3770 struct address_space *swapcache_mapping(struct folio *folio) 3771 { 3772 return swp_swap_info(folio->swap)->swap_file->f_mapping; 3773 } 3774 EXPORT_SYMBOL_GPL(swapcache_mapping); 3775 3776 pgoff_t __folio_swap_cache_index(struct folio *folio) 3777 { 3778 return swap_cache_index(folio->swap); 3779 } 3780 EXPORT_SYMBOL_GPL(__folio_swap_cache_index); 3781 3782 /* 3783 * add_swap_count_continuation - called when a swap count is duplicated 3784 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3785 * page of the original vmalloc'ed swap_map, to hold the continuation count 3786 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3787 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3788 * 3789 * These continuation pages are seldom referenced: the common paths all work 3790 * on the original swap_map, only referring to a continuation page when the 3791 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3792 * 3793 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3794 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3795 * can be called after dropping locks. 3796 */ 3797 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3798 { 3799 struct swap_info_struct *si; 3800 struct swap_cluster_info *ci; 3801 struct page *head; 3802 struct page *page; 3803 struct page *list_page; 3804 pgoff_t offset; 3805 unsigned char count; 3806 int ret = 0; 3807 3808 /* 3809 * When debugging, it's easier to use __GFP_ZERO here; but it's better 3810 * for latency not to zero a page while GFP_ATOMIC and holding locks. 3811 */ 3812 page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3813 3814 si = get_swap_device(entry); 3815 if (!si) { 3816 /* 3817 * An acceptable race has occurred since the failing 3818 * __swap_duplicate(): the swap device may be swapoff 3819 */ 3820 goto outer; 3821 } 3822 spin_lock(&si->lock); 3823 3824 offset = swp_offset(entry); 3825 3826 ci = lock_cluster(si, offset); 3827 3828 count = swap_count(si->swap_map[offset]); 3829 3830 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3831 /* 3832 * The higher the swap count, the more likely it is that tasks 3833 * will race to add swap count continuation: we need to avoid 3834 * over-provisioning. 3835 */ 3836 goto out; 3837 } 3838 3839 if (!page) { 3840 ret = -ENOMEM; 3841 goto out; 3842 } 3843 3844 head = vmalloc_to_page(si->swap_map + offset); 3845 offset &= ~PAGE_MASK; 3846 3847 spin_lock(&si->cont_lock); 3848 /* 3849 * Page allocation does not initialize the page's lru field, 3850 * but it does always reset its private field. 
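* Hence page_private(head) == 0 means this swap_map page has no continuation * list yet: the branch below initializes head->lru and tags the page with * SWP_CONTINUED before any list walk.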
3851 */ 3852 if (!page_private(head)) { 3853 BUG_ON(count & COUNT_CONTINUED); 3854 INIT_LIST_HEAD(&head->lru); 3855 set_page_private(head, SWP_CONTINUED); 3856 si->flags |= SWP_CONTINUED; 3857 } 3858 3859 list_for_each_entry(list_page, &head->lru, lru) { 3860 unsigned char *map; 3861 3862 /* 3863 * If the previous map said no continuation, but we've found 3864 * a continuation page, free our allocation and use this one. 3865 */ 3866 if (!(count & COUNT_CONTINUED)) 3867 goto out_unlock_cont; 3868 3869 map = kmap_local_page(list_page) + offset; 3870 count = *map; 3871 kunmap_local(map); 3872 3873 /* 3874 * If this continuation count now has some space in it, 3875 * free our allocation and use this one. 3876 */ 3877 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3878 goto out_unlock_cont; 3879 } 3880 3881 list_add_tail(&page->lru, &head->lru); 3882 page = NULL; /* now it's attached, don't free it */ 3883 out_unlock_cont: 3884 spin_unlock(&si->cont_lock); 3885 out: 3886 unlock_cluster(ci); 3887 spin_unlock(&si->lock); 3888 put_swap_device(si); 3889 outer: 3890 if (page) 3891 __free_page(page); 3892 return ret; 3893 } 3894 3895 /* 3896 * swap_count_continued - when the original swap_map count is incremented 3897 * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3898 * into, carry if so, or else fail until a new continuation page is allocated; 3899 * when the original swap_map count is decremented from 0 with continuation, 3900 * borrow from the continuation and report whether it still holds more. 3901 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster 3902 * lock. 3903 */ 3904 static bool swap_count_continued(struct swap_info_struct *si, 3905 pgoff_t offset, unsigned char count) 3906 { 3907 struct page *head; 3908 struct page *page; 3909 unsigned char *map; 3910 bool ret; 3911 3912 head = vmalloc_to_page(si->swap_map + offset); 3913 if (page_private(head) != SWP_CONTINUED) { 3914 BUG_ON(count & COUNT_CONTINUED); 3915 return false; /* need to add count continuation */ 3916 } 3917 3918 spin_lock(&si->cont_lock); 3919 offset &= ~PAGE_MASK; 3920 page = list_next_entry(head, lru); 3921 map = kmap_local_page(page) + offset; 3922 3923 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3924 goto init_map; /* jump over SWAP_CONT_MAX checks */ 3925 3926 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3927 /* 3928 * Think of how you add 1 to 999 3929 */ 3930 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3931 kunmap_local(map); 3932 page = list_next_entry(page, lru); 3933 BUG_ON(page == head); 3934 map = kmap_local_page(page) + offset; 3935 } 3936 if (*map == SWAP_CONT_MAX) { 3937 kunmap_local(map); 3938 page = list_next_entry(page, lru); 3939 if (page == head) { 3940 ret = false; /* add count continuation */ 3941 goto out; 3942 } 3943 map = kmap_local_page(page) + offset; 3944 init_map: *map = 0; /* we didn't zero the page */ 3945 } 3946 *map += 1; 3947 kunmap_local(map); 3948 while ((page = list_prev_entry(page, lru)) != head) { 3949 map = kmap_local_page(page) + offset; 3950 *map = COUNT_CONTINUED; 3951 kunmap_local(map); 3952 } 3953 ret = true; /* incremented */ 3954 3955 } else { /* decrementing */ 3956 /* 3957 * Think of how you subtract 1 from 1000 3958 */ 3959 BUG_ON(count != COUNT_CONTINUED); 3960 while (*map == COUNT_CONTINUED) { 3961 kunmap_local(map); 3962 page = list_next_entry(page, lru); 3963 BUG_ON(page == head); 3964 map = kmap_local_page(page) + offset; 3965 } 3966 BUG_ON(*map == 0); 3967 *map -= 
1; 3968 if (*map == 0) 3969 count = 0; 3970 kunmap_local(map); 3971 while ((page = list_prev_entry(page, lru)) != head) { 3972 map = kmap_local_page(page) + offset; 3973 *map = SWAP_CONT_MAX | count; 3974 count = COUNT_CONTINUED; 3975 kunmap_local(map); 3976 } 3977 ret = count == COUNT_CONTINUED; 3978 } 3979 out: 3980 spin_unlock(&si->cont_lock); 3981 return ret; 3982 } 3983 3984 /* 3985 * free_swap_count_continuations - swapoff free all the continuation pages 3986 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3987 */ 3988 static void free_swap_count_continuations(struct swap_info_struct *si) 3989 { 3990 pgoff_t offset; 3991 3992 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3993 struct page *head; 3994 head = vmalloc_to_page(si->swap_map + offset); 3995 if (page_private(head)) { 3996 struct page *page, *next; 3997 3998 list_for_each_entry_safe(page, next, &head->lru, lru) { 3999 list_del(&page->lru); 4000 __free_page(page); 4001 } 4002 } 4003 } 4004 } 4005 4006 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 4007 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) 4008 { 4009 struct swap_info_struct *si, *next; 4010 int nid = folio_nid(folio); 4011 4012 if (!(gfp & __GFP_IO)) 4013 return; 4014 4015 if (!__has_usable_swap()) 4016 return; 4017 4018 if (!blk_cgroup_congested()) 4019 return; 4020 4021 /* 4022 * We've already scheduled a throttle, avoid taking the global swap 4023 * lock. 4024 */ 4025 if (current->throttle_disk) 4026 return; 4027 4028 spin_lock(&swap_avail_lock); 4029 plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], 4030 avail_lists[nid]) { 4031 if (si->bdev) { 4032 blkcg_schedule_throttle(si->bdev->bd_disk, true); 4033 break; 4034 } 4035 } 4036 spin_unlock(&swap_avail_lock); 4037 } 4038 #endif 4039 4040 static int __init swapfile_init(void) 4041 { 4042 int nid; 4043 4044 swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), 4045 GFP_KERNEL); 4046 if (!swap_avail_heads) { 4047 pr_emerg("Not enough memory for swap heads, swap is disabled\n"); 4048 return -ENOMEM; 4049 } 4050 4051 for_each_node(nid) 4052 plist_head_init(&swap_avail_heads[nid]); 4053 4054 swapfile_maximum_size = arch_max_swapfile_size(); 4055 4056 #ifdef CONFIG_MIGRATION 4057 if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) 4058 swap_migration_ad_supported = true; 4059 #endif /* CONFIG_MIGRATION */ 4060 4061 return 0; 4062 } 4063 subsys_initcall(swapfile_init); 4064