1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * linux/mm/swapfile.c
4 *
5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6 * Swap reorganised 29.12.95, Stephen Tweedie
7 */
8
9 #include <linux/blkdev.h>
10 #include <linux/mm.h>
11 #include <linux/sched/mm.h>
12 #include <linux/sched/task.h>
13 #include <linux/hugetlb.h>
14 #include <linux/mman.h>
15 #include <linux/slab.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/swap.h>
18 #include <linux/vmalloc.h>
19 #include <linux/pagemap.h>
20 #include <linux/namei.h>
21 #include <linux/shmem_fs.h>
22 #include <linux/blk-cgroup.h>
23 #include <linux/random.h>
24 #include <linux/writeback.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/init.h>
28 #include <linux/ksm.h>
29 #include <linux/rmap.h>
30 #include <linux/security.h>
31 #include <linux/backing-dev.h>
32 #include <linux/mutex.h>
33 #include <linux/capability.h>
34 #include <linux/syscalls.h>
35 #include <linux/memcontrol.h>
36 #include <linux/poll.h>
37 #include <linux/oom.h>
38 #include <linux/swapfile.h>
39 #include <linux/export.h>
40 #include <linux/sort.h>
41 #include <linux/completion.h>
42 #include <linux/suspend.h>
43 #include <linux/zswap.h>
44 #include <linux/plist.h>
45
46 #include <asm/tlbflush.h>
47 #include <linux/leafops.h>
48 #include <linux/swap_cgroup.h>
49 #include "swap_table.h"
50 #include "internal.h"
51 #include "swap_table.h"
52 #include "swap.h"
53
54 static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
55 unsigned char);
56 static void free_swap_count_continuations(struct swap_info_struct *);
57 static void swap_range_alloc(struct swap_info_struct *si,
58 unsigned int nr_entries);
59 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
60 static void swap_put_entry_locked(struct swap_info_struct *si,
61 struct swap_cluster_info *ci,
62 unsigned long offset);
63 static bool folio_swapcache_freeable(struct folio *folio);
64 static void move_cluster(struct swap_info_struct *si,
65 struct swap_cluster_info *ci, struct list_head *list,
66 enum swap_cluster_flags new_flags);
67
68 static DEFINE_SPINLOCK(swap_lock);
69 static unsigned int nr_swapfiles;
70 atomic_long_t nr_swap_pages;
71 /*
72 * Some modules use swappable objects and may try to swap them out under
73 * memory pressure (via the shrinker). Before doing so, they may wish to
74 * check to see if any swap space is available.
75 */
76 EXPORT_SYMBOL_GPL(nr_swap_pages);
77 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
78 long total_swap_pages;
79 #define DEF_SWAP_PRIO -1
80 unsigned long swapfile_maximum_size;
81 #ifdef CONFIG_MIGRATION
82 bool swap_migration_ad_supported;
83 #endif /* CONFIG_MIGRATION */
84
85 static const char Bad_file[] = "Bad swap file entry ";
86 static const char Bad_offset[] = "Bad swap offset entry ";
87
88 /*
89 * all active swap_info_structs
90 * protected with swap_lock, and ordered by priority.
91 */
92 static PLIST_HEAD(swap_active_head);
93
94 /*
95 * all available (active, not full) swap_info_structs
96 * protected with swap_avail_lock, ordered by priority.
97 * This is used by folio_alloc_swap() instead of swap_active_head
98 * because swap_active_head includes all swap_info_structs,
99 * but folio_alloc_swap() doesn't need to look at full ones.
100 * This uses its own lock instead of swap_lock because when a
101 * swap_info_struct changes between not-full/full, it needs to
102 * add/remove itself to/from this list, but the swap_info_struct->lock
103 * is held and the locking order requires swap_lock to be taken
104 * before any swap_info_struct->lock.
105 */
106 static PLIST_HEAD(swap_avail_head);
107 static DEFINE_SPINLOCK(swap_avail_lock);
108
109 struct swap_info_struct *swap_info[MAX_SWAPFILES];
110
111 static struct kmem_cache *swap_table_cachep;
112
113 static DEFINE_MUTEX(swapon_mutex);
114
115 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
116 /* Activity counter to indicate that a swapon or swapoff has occurred */
117 static atomic_t proc_poll_event = ATOMIC_INIT(0);
118
119 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
120
121 struct percpu_swap_cluster {
122 struct swap_info_struct *si[SWAP_NR_ORDERS];
123 unsigned long offset[SWAP_NR_ORDERS];
124 local_lock_t lock;
125 };
126
127 static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
128 .si = { NULL },
129 .offset = { SWAP_ENTRY_INVALID },
130 .lock = INIT_LOCAL_LOCK(),
131 };
132
133 /* May return NULL on invalid type, caller must check for NULL return */
static struct swap_info_struct *swap_type_to_info(int type)
135 {
136 if (type >= MAX_SWAPFILES)
137 return NULL;
138 return READ_ONCE(swap_info[type]); /* rcu_dereference() */
139 }
140
141 /* May return NULL on invalid entry, caller must check for NULL return */
static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
143 {
144 return swap_type_to_info(swp_type(entry));
145 }
146
147 /*
 * Use the second highest bit of the inuse_pages counter as an indicator
 * of whether a swap device is on the available plist, so the atomic can
 * still be updated arithmetically while having special data embedded.
 *
 * The inuse_pages counter is the only thing indicating whether a device
 * should be on the avail_lists or not (except swapon / swapoff). By
 * embedding the off-list bit in the atomic counter, updates no longer
 * need any lock to check the list status.
 *
 * This bit will be set if the device is not on the plist and not
 * usable, and will be cleared if the device is on the plist.
159 */
160 #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
161 #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
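/* Number of in-use pages on this device, with the off-list bit masked off. */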
static long swap_usage_in_pages(struct swap_info_struct *si)
163 {
164 return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
165 }
166
167 /* Reclaim the swap entry anyway if possible */
168 #define TTRS_ANYWAY 0x1
169 /*
170 * Reclaim the swap entry if there are no more mappings of the
171 * corresponding page
172 */
173 #define TTRS_UNMAPPED 0x2
174 /* Reclaim the swap entry if swap is getting full */
175 #define TTRS_FULL 0x4
176
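/*
 * Check whether the slots in [offset, offset + nr_pages) are held only by
 * the swap cache: each slot must have a folio in the swap table and a
 * swap count of zero.
 */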
static bool swap_only_has_cache(struct swap_info_struct *si,
178 struct swap_cluster_info *ci,
179 unsigned long offset, int nr_pages)
180 {
181 unsigned int ci_off = offset % SWAPFILE_CLUSTER;
182 unsigned char *map = si->swap_map + offset;
183 unsigned char *map_end = map + nr_pages;
184 unsigned long swp_tb;
185
186 do {
187 swp_tb = __swap_table_get(ci, ci_off);
188 VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
189 if (*map)
190 return false;
191 ++ci_off;
192 } while (++map < map_end);
193
194 return true;
195 }
196
197 /*
 * Returns the number of pages in the folio that backs the swap entry. If
 * positive, the folio was reclaimed. If negative, the folio was not
 * reclaimed. If 0, no folio was associated with the swap entry.
201 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
203 unsigned long offset, unsigned long flags)
204 {
205 const swp_entry_t entry = swp_entry(si->type, offset);
206 struct swap_cluster_info *ci;
207 struct folio *folio;
208 int ret, nr_pages;
209 bool need_reclaim;
210
211 again:
212 folio = swap_cache_get_folio(entry);
213 if (!folio)
214 return 0;
215
216 nr_pages = folio_nr_pages(folio);
217 ret = -nr_pages;
218
219 /*
 * We hold a folio lock here. We have to use trylock to
 * avoid deadlock. This is a special case and you should
222 * use folio_free_swap() with explicit folio_lock() in usual
223 * operations.
224 */
225 if (!folio_trylock(folio))
226 goto out;
227
228 /*
229 * Offset could point to the middle of a large folio, or folio
230 * may no longer point to the expected offset before it's locked.
231 */
232 if (!folio_matches_swap_entry(folio, entry)) {
233 folio_unlock(folio);
234 folio_put(folio);
235 goto again;
236 }
237 offset = swp_offset(folio->swap);
238
239 need_reclaim = ((flags & TTRS_ANYWAY) ||
240 ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
241 ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
242 if (!need_reclaim || !folio_swapcache_freeable(folio))
243 goto out_unlock;
244
245 /*
246 * It's safe to delete the folio from swap cache only if the folio
247 * is in swap cache with swap count == 0. The slots have no page table
248 * reference or pending writeback, and can't be allocated to others.
249 */
250 ci = swap_cluster_lock(si, offset);
251 need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
252 swap_cluster_unlock(ci);
253 if (!need_reclaim)
254 goto out_unlock;
255
256 swap_cache_del_folio(folio);
257 folio_set_dirty(folio);
258 ret = nr_pages;
259 out_unlock:
260 folio_unlock(folio);
261 out:
262 folio_put(folio);
263 return ret;
264 }
265
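/* Walk the per-device swap extent rbtree in page offset order. */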
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
267 {
268 struct rb_node *rb = rb_first(&sis->swap_extent_root);
269 return rb_entry(rb, struct swap_extent, rb_node);
270 }
271
static inline struct swap_extent *next_se(struct swap_extent *se)
273 {
274 struct rb_node *rb = rb_next(&se->rb_node);
275 return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
276 }
277
278 /*
 * swapon tells the device that all the old swap contents can be discarded,
280 * to allow the swap device to optimize its wear-levelling.
281 */
static int discard_swap(struct swap_info_struct *si)
283 {
284 struct swap_extent *se;
285 sector_t start_block;
286 sector_t nr_blocks;
287 int err = 0;
288
289 /* Do not discard the swap header page! */
290 se = first_se(si);
291 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
292 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
293 if (nr_blocks) {
294 err = blkdev_issue_discard(si->bdev, start_block,
295 nr_blocks, GFP_KERNEL);
296 if (err)
297 return err;
298 cond_resched();
299 }
300
301 for (se = next_se(se); se; se = next_se(se)) {
302 start_block = se->start_block << (PAGE_SHIFT - 9);
303 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
304
305 err = blkdev_issue_discard(si->bdev, start_block,
306 nr_blocks, GFP_KERNEL);
307 if (err)
308 break;
309
310 cond_resched();
311 }
312 return err; /* That will often be -EOPNOTSUPP */
313 }
314
315 static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
317 {
318 struct swap_extent *se;
319 struct rb_node *rb;
320
321 rb = sis->swap_extent_root.rb_node;
322 while (rb) {
323 se = rb_entry(rb, struct swap_extent, rb_node);
324 if (offset < se->start_page)
325 rb = rb->rb_left;
326 else if (offset >= se->start_page + se->nr_pages)
327 rb = rb->rb_right;
328 else
329 return se;
330 }
331 /* It *must* be present */
332 BUG();
333 }
334
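/*
 * Return the device sector (in 512-byte units) backing the first page of
 * the folio's swap entry.
 */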
sector_t swap_folio_sector(struct folio *folio)
336 {
337 struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
338 struct swap_extent *se;
339 sector_t sector;
340 pgoff_t offset;
341
342 offset = swp_offset(folio->swap);
343 se = offset_to_swap_extent(sis, offset);
344 sector = se->start_block + (offset - se->start_page);
345 return sector << (PAGE_SHIFT - 9);
346 }
347
348 /*
 * swap allocation tells the device that a cluster of swap can now be discarded,
350 * to allow the swap device to optimize its wear-levelling.
351 */
static void discard_swap_cluster(struct swap_info_struct *si,
353 pgoff_t start_page, pgoff_t nr_pages)
354 {
355 struct swap_extent *se = offset_to_swap_extent(si, start_page);
356
357 while (nr_pages) {
358 pgoff_t offset = start_page - se->start_page;
359 sector_t start_block = se->start_block + offset;
360 sector_t nr_blocks = se->nr_pages - offset;
361
362 if (nr_blocks > nr_pages)
363 nr_blocks = nr_pages;
364 start_page += nr_blocks;
365 nr_pages -= nr_blocks;
366
367 start_block <<= PAGE_SHIFT - 9;
368 nr_blocks <<= PAGE_SHIFT - 9;
369 if (blkdev_issue_discard(si->bdev, start_block,
370 nr_blocks, GFP_NOIO))
371 break;
372
373 se = next_se(se);
374 }
375 }
376
377 #define LATENCY_LIMIT 256
378
static inline bool cluster_is_empty(struct swap_cluster_info *info)
380 {
381 return info->count == 0;
382 }
383
static inline bool cluster_is_discard(struct swap_cluster_info *info)
385 {
386 return info->flags == CLUSTER_FLAG_DISCARD;
387 }
388
static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
390 {
391 return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
392 }
393
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
395 {
396 if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
397 return false;
398 if (!cluster_table_is_alloced(ci))
399 return false;
400 if (!order)
401 return true;
402 return cluster_is_empty(ci) || order == ci->order;
403 }
404
static inline unsigned int cluster_index(struct swap_info_struct *si,
406 struct swap_cluster_info *ci)
407 {
408 return ci - si->cluster_info;
409 }
410
static inline unsigned int cluster_offset(struct swap_info_struct *si,
412 struct swap_cluster_info *ci)
413 {
414 return cluster_index(si, ci) * SWAPFILE_CLUSTER;
415 }
416
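/*
 * Allocate a zeroed swap table, either from the dedicated slab cache or
 * as a whole page, depending on SWP_TABLE_USE_PAGE.
 */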
static struct swap_table *swap_table_alloc(gfp_t gfp)
418 {
419 struct folio *folio;
420
421 if (!SWP_TABLE_USE_PAGE)
422 return kmem_cache_zalloc(swap_table_cachep, gfp);
423
424 folio = folio_alloc(gfp | __GFP_ZERO, 0);
425 if (folio)
426 return folio_address(folio);
427 return NULL;
428 }
429
static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
431 {
432 struct folio *folio;
433
434 folio = page_folio(container_of(head, struct page, rcu_head));
435 folio_put(folio);
436 }
437
static void swap_table_free(struct swap_table *table)
439 {
440 if (!SWP_TABLE_USE_PAGE) {
441 kmem_cache_free(swap_table_cachep, table);
442 return;
443 }
444
445 call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
446 swap_table_free_folio_rcu_cb);
447 }
448
static void swap_cluster_free_table(struct swap_cluster_info *ci)
450 {
451 unsigned int ci_off;
452 struct swap_table *table;
453
/* Only an empty cluster's table is allowed to be freed */
455 lockdep_assert_held(&ci->lock);
456 VM_WARN_ON_ONCE(!cluster_is_empty(ci));
457 for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
458 VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
459 table = (void *)rcu_dereference_protected(ci->table, true);
460 rcu_assign_pointer(ci->table, NULL);
461
462 swap_table_free(table);
463 }
464
465 /*
466 * Allocate swap table for one cluster. Attempt an atomic allocation first,
 * then fall back to a sleeping allocation.
468 */
469 static struct swap_cluster_info *
swap_cluster_alloc_table(struct swap_info_struct *si,
471 struct swap_cluster_info *ci)
472 {
473 struct swap_table *table;
474
475 /*
476 * Only cluster isolation from the allocator does table allocation.
477 * Swap allocator uses percpu clusters and holds the local lock.
478 */
479 lockdep_assert_held(&ci->lock);
480 lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
481
482 /* The cluster must be free and was just isolated from the free list. */
483 VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
484
485 table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
486 if (table) {
487 rcu_assign_pointer(ci->table, table);
488 return ci;
489 }
490
491 /*
 * Try a sleeping allocation. Each isolated free cluster may cause
493 * a sleep allocation, but there is a limited number of them, so
494 * the potential recursive allocation is limited.
495 */
496 spin_unlock(&ci->lock);
497 if (!(si->flags & SWP_SOLIDSTATE))
498 spin_unlock(&si->global_cluster_lock);
499 local_unlock(&percpu_swap_cluster.lock);
500
501 table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
502
503 /*
504 * Back to atomic context. We might have migrated to a new CPU with a
505 * usable percpu cluster. But just keep using the isolated cluster to
506 * make things easier. Migration indicates a slight change of workload
 * so using a new free cluster might not be a bad idea, and the worst
 * that could happen from ignoring the percpu cluster is fragmentation,
 * which is acceptable since this fallback and race are rare.
510 */
511 local_lock(&percpu_swap_cluster.lock);
512 if (!(si->flags & SWP_SOLIDSTATE))
513 spin_lock(&si->global_cluster_lock);
514 spin_lock(&ci->lock);
515
516 /* Nothing except this helper should touch a dangling empty cluster. */
517 if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
518 if (table)
519 swap_table_free(table);
520 return ci;
521 }
522
523 if (!table) {
524 move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
525 spin_unlock(&ci->lock);
526 return NULL;
527 }
528
529 rcu_assign_pointer(ci->table, table);
530 return ci;
531 }
532
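/*
 * Move a cluster to the target list and update its flags. The caller must
 * hold ci->lock; si->lock is taken internally for the list update.
 */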
static void move_cluster(struct swap_info_struct *si,
534 struct swap_cluster_info *ci, struct list_head *list,
535 enum swap_cluster_flags new_flags)
536 {
537 VM_WARN_ON(ci->flags == new_flags);
538
539 BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
540 lockdep_assert_held(&ci->lock);
541
542 spin_lock(&si->lock);
543 if (ci->flags == CLUSTER_FLAG_NONE)
544 list_add_tail(&ci->list, list);
545 else
546 list_move_tail(&ci->list, list);
547 spin_unlock(&si->lock);
548 ci->flags = new_flags;
549 }
550
551 /* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
553 struct swap_cluster_info *ci)
554 {
555 VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
556 move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
557 schedule_work(&si->discard_work);
558 }
559
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
561 {
562 swap_cluster_free_table(ci);
563 move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
564 ci->order = 0;
565 }
566
567 /*
 * Isolate and lock the first cluster on a list that is not contended,
 * clearing its flag before it is taken off the list. Cluster flags must be in sync
570 * with list status, so cluster updaters can always know the cluster
571 * list status without touching si lock.
572 *
 * Note it's possible that all clusters on a list are contended, so
 * this may return NULL for a non-empty list.
575 */
static struct swap_cluster_info *isolate_lock_cluster(
577 struct swap_info_struct *si, struct list_head *list)
578 {
579 struct swap_cluster_info *ci, *found = NULL;
580
581 spin_lock(&si->lock);
582 list_for_each_entry(ci, list, list) {
583 if (!spin_trylock(&ci->lock))
584 continue;
585
/* We may only isolate and clear the flags of the following lists */
587 VM_BUG_ON(!ci->flags);
588 VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
589 ci->flags != CLUSTER_FLAG_FULL);
590
591 list_del(&ci->list);
592 ci->flags = CLUSTER_FLAG_NONE;
593 found = ci;
594 break;
595 }
596 spin_unlock(&si->lock);
597
598 if (found && !cluster_table_is_alloced(found)) {
599 /* Only an empty free cluster's swap table can be freed. */
600 VM_WARN_ON_ONCE(list != &si->free_clusters);
601 VM_WARN_ON_ONCE(!cluster_is_empty(found));
602 return swap_cluster_alloc_table(si, found);
603 }
604
605 return found;
606 }
607
608 /*
 * Do the actual discard. After a cluster discard is finished, the cluster
 * will be added to the free cluster list. Discard clusters are a bit special as
 * they don't participate in allocation or reclaim, so clusters marked as
 * CLUSTER_FLAG_DISCARD must remain off-list or on the discard list.
613 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
615 {
616 struct swap_cluster_info *ci;
617 bool ret = false;
618 unsigned int idx;
619
620 spin_lock(&si->lock);
621 while (!list_empty(&si->discard_clusters)) {
622 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
623 /*
 * Delete the cluster from the list to prepare for discard, but keep
 * the CLUSTER_FLAG_DISCARD flag: percpu_swap_cluster could still be
 * pointing to it, or it could be run into by relocate_cluster.
627 */
628 list_del(&ci->list);
629 idx = cluster_index(si, ci);
630 spin_unlock(&si->lock);
631 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
632 SWAPFILE_CLUSTER);
633
634 spin_lock(&ci->lock);
635 /*
636 * Discard is done, clear its flags as it's off-list, then
637 * return the cluster to allocation list.
638 */
639 ci->flags = CLUSTER_FLAG_NONE;
640 __free_cluster(si, ci);
641 spin_unlock(&ci->lock);
642 ret = true;
643 spin_lock(&si->lock);
644 }
645 spin_unlock(&si->lock);
646 return ret;
647 }
648
static void swap_discard_work(struct work_struct *work)
650 {
651 struct swap_info_struct *si;
652
653 si = container_of(work, struct swap_info_struct, discard_work);
654
655 swap_do_scheduled_discard(si);
656 }
657
static void swap_users_ref_free(struct percpu_ref *ref)
659 {
660 struct swap_info_struct *si;
661
662 si = container_of(ref, struct swap_info_struct, users);
663 complete(&si->comp);
664 }
665
666 /*
 * Must be called after freeing if ci->count == 0; moves the cluster to the free
 * or discard list.
669 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
671 {
672 VM_BUG_ON(ci->count != 0);
673 VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
674 lockdep_assert_held(&ci->lock);
675
676 /*
 * If the swap is discardable, schedule a discard of the cluster
 * instead of freeing it immediately. The cluster will be freed
 * after the discard is done.
680 */
681 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
682 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
683 swap_cluster_schedule_discard(si, ci);
684 return;
685 }
686
687 __free_cluster(si, ci);
688 }
689
690 /*
 * Must be called after freeing if ci->count != 0; moves the cluster to the
 * nonfull list.
693 */
static void partial_free_cluster(struct swap_info_struct *si,
695 struct swap_cluster_info *ci)
696 {
697 VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
698 lockdep_assert_held(&ci->lock);
699
700 if (ci->flags != CLUSTER_FLAG_NONFULL)
701 move_cluster(si, ci, &si->nonfull_clusters[ci->order],
702 CLUSTER_FLAG_NONFULL);
703 }
704
705 /*
 * Must be called after allocation; moves the cluster to the full or frag list.
 * Note: allocation doesn't acquire the si lock, and may drop the ci lock for
 * reclaim, so the cluster could be anywhere when this is called.
709 */
static void relocate_cluster(struct swap_info_struct *si,
711 struct swap_cluster_info *ci)
712 {
713 lockdep_assert_held(&ci->lock);
714
715 /* Discard cluster must remain off-list or on discard list */
716 if (cluster_is_discard(ci))
717 return;
718
719 if (!ci->count) {
720 if (ci->flags != CLUSTER_FLAG_FREE)
721 free_cluster(si, ci);
722 } else if (ci->count != SWAPFILE_CLUSTER) {
723 if (ci->flags != CLUSTER_FLAG_FRAG)
724 move_cluster(si, ci, &si->frag_clusters[ci->order],
725 CLUSTER_FLAG_FRAG);
726 } else {
727 if (ci->flags != CLUSTER_FLAG_FULL)
728 move_cluster(si, ci, &si->full_clusters,
729 CLUSTER_FLAG_FULL);
730 }
731 }
732
733 /*
734 * The cluster corresponding to @offset will be accounted as having one bad
735 * slot. The cluster will not be added to the free cluster list, and its
736 * usage counter will be increased by 1. Only used for initialization.
737 */
static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
739 unsigned long offset)
740 {
741 unsigned long idx = offset / SWAPFILE_CLUSTER;
742 struct swap_table *table;
743 struct swap_cluster_info *ci;
744
745 ci = cluster_info + idx;
746 if (!ci->table) {
747 table = swap_table_alloc(GFP_KERNEL);
748 if (!table)
749 return -ENOMEM;
750 rcu_assign_pointer(ci->table, table);
751 }
752
753 ci->count++;
754
755 WARN_ON(ci->count > SWAPFILE_CLUSTER);
756 WARN_ON(ci->flags);
757
758 return 0;
759 }
760
761 /*
762 * Reclaim drops the ci lock, so the cluster may become unusable (freed or
763 * stolen by a lower order). @usable will be set to false if that happens.
764 */
static bool cluster_reclaim_range(struct swap_info_struct *si,
766 struct swap_cluster_info *ci,
767 unsigned long start, unsigned int order,
768 bool *usable)
769 {
770 unsigned int nr_pages = 1 << order;
771 unsigned long offset = start, end = start + nr_pages;
772 unsigned char *map = si->swap_map;
773 unsigned long swp_tb;
774
775 spin_unlock(&ci->lock);
776 do {
777 if (READ_ONCE(map[offset]))
778 break;
779 swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
780 if (swp_tb_is_folio(swp_tb)) {
781 if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
782 break;
783 }
784 } while (++offset < end);
785 spin_lock(&ci->lock);
786
787 /*
 * We just dropped ci->lock, so the cluster could have been used by another
 * order or freed; check if it's still usable or empty.
790 */
791 if (!cluster_is_usable(ci, order)) {
792 *usable = false;
793 return false;
794 }
795 *usable = true;
796
797 /* Fast path, no need to scan if the whole cluster is empty */
798 if (cluster_is_empty(ci))
799 return true;
800
801 /*
 * Recheck the range whether reclaim succeeded or not; a slot
 * could have been freed while we were not holding the lock.
804 */
805 for (offset = start; offset < end; offset++) {
806 swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
807 if (map[offset] || !swp_tb_is_null(swp_tb))
808 return false;
809 }
810
811 return true;
812 }
813
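/*
 * Check whether the range is free enough for allocation. Sets *need_reclaim
 * if a slot is pinned only by the swap cache and should be reclaimed first.
 */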
static bool cluster_scan_range(struct swap_info_struct *si,
815 struct swap_cluster_info *ci,
816 unsigned long offset, unsigned int nr_pages,
817 bool *need_reclaim)
818 {
819 unsigned long end = offset + nr_pages;
820 unsigned char *map = si->swap_map;
821 unsigned long swp_tb;
822
823 if (cluster_is_empty(ci))
824 return true;
825
826 do {
827 if (map[offset])
828 return false;
829 swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
830 if (swp_tb_is_folio(swp_tb)) {
831 if (!vm_swap_full())
832 return false;
833 *need_reclaim = true;
834 } else {
/* An entry with no count and no cache must be null */
836 VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
837 }
838 } while (++offset < end);
839
840 return true;
841 }
842
843 /*
 * Currently, the swap table is not used for count tracking; just
 * do a sanity check here to ensure nothing leaked, as the swap
 * table should be empty upon freeing.
847 */
static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
849 unsigned int start, unsigned int nr)
850 {
851 unsigned int ci_off = start % SWAPFILE_CLUSTER;
852 unsigned int ci_end = ci_off + nr;
853 unsigned long swp_tb;
854
855 if (IS_ENABLED(CONFIG_DEBUG_VM)) {
856 do {
857 swp_tb = __swap_table_get(ci, ci_off);
858 VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
859 } while (++ci_off < ci_end);
860 }
861 }
862
static bool cluster_alloc_range(struct swap_info_struct *si,
864 struct swap_cluster_info *ci,
865 struct folio *folio,
866 unsigned int offset)
867 {
868 unsigned long nr_pages;
869 unsigned int order;
870
871 lockdep_assert_held(&ci->lock);
872
873 if (!(si->flags & SWP_WRITEOK))
874 return false;
875
876 /*
 * All mm swap allocation starts with a folio (folio_alloc_swap),
 * which is also the only allocation path for large order allocations.
 * Such swap slots start with count == 0 and the count is increased
 * upon folio unmap.
 *
 * Otherwise, it's an exclusive order 0 allocation for hibernation.
 * The slot starts with count == 1 and never increases.
884 */
885 if (likely(folio)) {
886 order = folio_order(folio);
887 nr_pages = 1 << order;
888 __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
889 } else if (IS_ENABLED(CONFIG_HIBERNATION)) {
890 order = 0;
891 nr_pages = 1;
892 WARN_ON_ONCE(si->swap_map[offset]);
893 si->swap_map[offset] = 1;
894 swap_cluster_assert_table_empty(ci, offset, 1);
895 } else {
896 /* Allocation without folio is only possible with hibernation */
897 WARN_ON_ONCE(1);
898 return false;
899 }
900
901 /*
902 * The first allocation in a cluster makes the
903 * cluster exclusive to this order
904 */
905 if (cluster_is_empty(ci))
906 ci->order = order;
907 ci->count += nr_pages;
908 swap_range_alloc(si, nr_pages);
909
910 return true;
911 }
912
/* Try to use a new cluster for the current CPU and allocate from it. */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
915 struct swap_cluster_info *ci,
916 struct folio *folio, unsigned long offset)
917 {
918 unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
919 unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
920 unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
921 unsigned int order = likely(folio) ? folio_order(folio) : 0;
922 unsigned int nr_pages = 1 << order;
923 bool need_reclaim, ret, usable;
924
925 lockdep_assert_held(&ci->lock);
926 VM_WARN_ON(!cluster_is_usable(ci, order));
927
928 if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
929 goto out;
930
931 for (end -= nr_pages; offset <= end; offset += nr_pages) {
932 need_reclaim = false;
933 if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
934 continue;
935 if (need_reclaim) {
936 ret = cluster_reclaim_range(si, ci, offset, order, &usable);
937 if (!usable)
938 goto out;
939 if (cluster_is_empty(ci))
940 offset = start;
941 /* Reclaim failed but cluster is usable, try next */
942 if (!ret)
943 continue;
944 }
945 if (!cluster_alloc_range(si, ci, folio, offset))
946 break;
947 found = offset;
948 offset += nr_pages;
949 if (ci->count < SWAPFILE_CLUSTER && offset <= end)
950 next = offset;
951 break;
952 }
953 out:
954 relocate_cluster(si, ci);
955 swap_cluster_unlock(ci);
956 if (si->flags & SWP_SOLIDSTATE) {
957 this_cpu_write(percpu_swap_cluster.offset[order], next);
958 this_cpu_write(percpu_swap_cluster.si[order], si);
959 } else {
960 si->global_cluster->next[order] = next;
961 }
962 return found;
963 }
964
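/*
 * Isolate a cluster from @list and try to allocate from it. If @scan_all
 * is true, keep isolating clusters until the allocation succeeds or the
 * list is drained.
 */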
static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
966 struct list_head *list,
967 struct folio *folio,
968 bool scan_all)
969 {
970 unsigned int found = SWAP_ENTRY_INVALID;
971
972 do {
973 struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
974 unsigned long offset;
975
976 if (!ci)
977 break;
978 offset = cluster_offset(si, ci);
979 found = alloc_swap_scan_cluster(si, ci, folio, offset);
980 if (found)
981 break;
982 } while (scan_all);
983
984 return found;
985 }
986
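/*
 * Walk the full cluster list and reclaim clean swap cache so the underlying
 * slots can be reused. If @force is true, scan a number of clusters
 * proportional to the device usage instead of just one.
 */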
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
988 {
989 long to_scan = 1;
990 unsigned long offset, end;
991 struct swap_cluster_info *ci;
992 unsigned char *map = si->swap_map;
993 int nr_reclaim;
994
995 if (force)
996 to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
997
998 while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
999 offset = cluster_offset(si, ci);
1000 end = min(si->max, offset + SWAPFILE_CLUSTER);
1001 to_scan--;
1002
1003 while (offset < end) {
1004 if (!READ_ONCE(map[offset]) &&
1005 swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
1006 spin_unlock(&ci->lock);
1007 nr_reclaim = __try_to_reclaim_swap(si, offset,
1008 TTRS_ANYWAY);
1009 spin_lock(&ci->lock);
1010 if (nr_reclaim) {
1011 offset += abs(nr_reclaim);
1012 continue;
1013 }
1014 }
1015 offset++;
1016 }
1017
1018 /* in case no swap cache is reclaimed */
1019 if (ci->flags == CLUSTER_FLAG_NONE)
1020 relocate_cluster(si, ci);
1021
1022 swap_cluster_unlock(ci);
1023 if (to_scan <= 0)
1024 break;
1025 }
1026 }
1027
static void swap_reclaim_work(struct work_struct *work)
1029 {
1030 struct swap_info_struct *si;
1031
1032 si = container_of(work, struct swap_info_struct, reclaim_work);
1033
1034 swap_reclaim_full_clusters(si, true);
1035 }
1036
1037 /*
 * Try to allocate swap entries of the specified order and try to set a new
 * cluster for the current CPU too.
1040 */
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
1042 struct folio *folio)
1043 {
1044 struct swap_cluster_info *ci;
1045 unsigned int order = likely(folio) ? folio_order(folio) : 0;
1046 unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
1047
1048 /*
 * The swap device is not a block device, so it is unable
 * to allocate large entries.
1051 */
1052 if (order && !(si->flags & SWP_BLKDEV))
1053 return 0;
1054
1055 if (!(si->flags & SWP_SOLIDSTATE)) {
1056 /* Serialize HDD SWAP allocation for each device. */
1057 spin_lock(&si->global_cluster_lock);
1058 offset = si->global_cluster->next[order];
1059 if (offset == SWAP_ENTRY_INVALID)
1060 goto new_cluster;
1061
1062 ci = swap_cluster_lock(si, offset);
1063 /* Cluster could have been used by another order */
1064 if (cluster_is_usable(ci, order)) {
1065 if (cluster_is_empty(ci))
1066 offset = cluster_offset(si, ci);
1067 found = alloc_swap_scan_cluster(si, ci, folio, offset);
1068 } else {
1069 swap_cluster_unlock(ci);
1070 }
1071 if (found)
1072 goto done;
1073 }
1074
1075 new_cluster:
1076 /*
 * If the device needs discard, prefer a new cluster over a nonfull one
 * to spread out the writes.
1079 */
1080 if (si->flags & SWP_PAGE_DISCARD) {
1081 found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
1082 if (found)
1083 goto done;
1084 }
1085
1086 if (order < PMD_ORDER) {
1087 found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
1088 if (found)
1089 goto done;
1090 }
1091
1092 if (!(si->flags & SWP_PAGE_DISCARD)) {
1093 found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
1094 if (found)
1095 goto done;
1096 }
1097
1098 /* Try reclaim full clusters if free and nonfull lists are drained */
1099 if (vm_swap_full())
1100 swap_reclaim_full_clusters(si, false);
1101
1102 if (order < PMD_ORDER) {
1103 /*
 * Scanning only one fragment cluster is good enough. Order 0
 * allocation will surely succeed, and large allocation
1106 * failure is not critical. Scanning one cluster still
1107 * keeps the list rotated and reclaimed (for clean swap cache).
1108 */
1109 found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
1110 if (found)
1111 goto done;
1112 }
1113
1114 if (order)
1115 goto done;
1116
1117 /* Order 0 stealing from higher order */
1118 for (int o = 1; o < SWAP_NR_ORDERS; o++) {
1119 /*
 * Clusters here have at least one usable slot and can't fail order 0
1121 * allocation, but reclaim may drop si->lock and race with another user.
1122 */
1123 found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
1124 if (found)
1125 goto done;
1126
1127 found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
1128 if (found)
1129 goto done;
1130 }
1131 done:
1132 if (!(si->flags & SWP_SOLIDSTATE))
1133 spin_unlock(&si->global_cluster_lock);
1134
1135 return found;
1136 }
1137
1138 /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
1140 {
1141 unsigned long pages;
1142
1143 spin_lock(&swap_avail_lock);
1144
1145 if (swapoff) {
1146 /*
 * Forcefully remove it. Clear the SWP_WRITEOK flag for
1148 * swapoff here so it's synchronized by both si->lock and
1149 * swap_avail_lock, to ensure the result can be seen by
1150 * add_to_avail_list.
1151 */
1152 lockdep_assert_held(&si->lock);
1153 si->flags &= ~SWP_WRITEOK;
1154 atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
1155 } else {
1156 /*
 * If not called by swapoff, take it off-list only if it's
 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
 * si->inuse_pages == pages). Any concurrent slot freeing,
 * or the device having already been removed from the plist
 * by someone else, will make the cmpxchg below fail.
1162 */
1163 pages = si->pages;
1164 if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
1165 pages | SWAP_USAGE_OFFLIST_BIT))
1166 goto skip;
1167 }
1168
1169 plist_del(&si->avail_list, &swap_avail_head);
1170
1171 skip:
1172 spin_unlock(&swap_avail_lock);
1173 }
1174
1175 /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
1177 {
1178 long val;
1179 unsigned long pages;
1180
1181 spin_lock(&swap_avail_lock);
1182
1183 /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
1184 if (swapon) {
1185 lockdep_assert_held(&si->lock);
1186 si->flags |= SWP_WRITEOK;
1187 } else {
1188 if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
1189 goto skip;
1190 }
1191
1192 if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
1193 goto skip;
1194
1195 val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
1196
1197 /*
 * When the device is full and on the plist, only one updater will
 * see (inuse_pages == si->pages) and will call del_from_avail_list. If
 * that updater happens to be here, just skip adding.
1201 */
1202 pages = si->pages;
1203 if (val == pages) {
1204 /* Just like the cmpxchg in del_from_avail_list */
1205 if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
1206 pages | SWAP_USAGE_OFFLIST_BIT))
1207 goto skip;
1208 }
1209
1210 plist_add(&si->avail_list, &swap_avail_head);
1211
1212 skip:
1213 spin_unlock(&swap_avail_lock);
1214 }
1215
1216 /*
1217 * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
1218 * within each cluster, so the total contribution to the global counter should
1219 * always be positive and cannot exceed the total number of usable slots.
1220 */
static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
1222 {
1223 long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
1224
1225 /*
1226 * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
1227 * remove it from the plist.
1228 */
1229 if (unlikely(val == si->pages)) {
1230 del_from_avail_list(si, false);
1231 return true;
1232 }
1233
1234 return false;
1235 }
1236
static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
1238 {
1239 long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
1240
1241 /*
1242 * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
1243 * add it to the plist.
1244 */
1245 if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
1246 add_to_avail_list(si, false);
1247 }
1248
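/*
 * Account @nr_entries newly allocated slots: update the device usage counter,
 * kick background reclaim if the device just became full and swap is under
 * pressure, and decrease the global free swap counter.
 */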
static void swap_range_alloc(struct swap_info_struct *si,
1250 unsigned int nr_entries)
1251 {
1252 if (swap_usage_add(si, nr_entries)) {
1253 if (vm_swap_full())
1254 schedule_work(&si->reclaim_work);
1255 }
1256 atomic_long_sub(nr_entries, &nr_swap_pages);
1257 }
1258
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
1260 unsigned int nr_entries)
1261 {
1262 unsigned long begin = offset;
1263 unsigned long end = offset + nr_entries - 1;
1264 void (*swap_slot_free_notify)(struct block_device *, unsigned long);
1265 unsigned int i;
1266
1267 /*
 * Use atomic clear_bit operations only on zeromap instead of non-atomic
 * bitmap_clear to prevent adjacent bit corruption due to simultaneous writes.
1270 */
1271 for (i = 0; i < nr_entries; i++) {
1272 clear_bit(offset + i, si->zeromap);
1273 zswap_invalidate(swp_entry(si->type, offset + i));
1274 }
1275
1276 if (si->flags & SWP_BLKDEV)
1277 swap_slot_free_notify =
1278 si->bdev->bd_disk->fops->swap_slot_free_notify;
1279 else
1280 swap_slot_free_notify = NULL;
1281 while (offset <= end) {
1282 arch_swap_invalidate_page(si->type, offset);
1283 if (swap_slot_free_notify)
1284 swap_slot_free_notify(si->bdev, offset);
1285 offset++;
1286 }
1287 __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
1288
1289 /*
1290 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
1291 * only after the above cleanups are done.
1292 */
1293 smp_wmb();
1294 atomic_long_add(nr_entries, &nr_swap_pages);
1295 swap_usage_sub(si, nr_entries);
1296 }
1297
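/*
 * Try to pin the device with a percpu reference to block swapoff. On
 * success the caller must drop the reference with put_swap_device().
 */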
static bool get_swap_device_info(struct swap_info_struct *si)
1299 {
1300 if (!percpu_ref_tryget_live(&si->users))
1301 return false;
1302 /*
 * Guarantee that si->users is checked before accessing other
 * fields of swap_info_struct, and that si->flags (SWP_WRITEOK) is
 * up to date.
1306 *
1307 * Paired with the spin_unlock() after setup_swap_info() in
1308 * enable_swap_info(), and smp_wmb() in swapoff.
1309 */
1310 smp_rmb();
1311 return true;
1312 }
1313
1314 /*
 * Fast path: try to get swap entries of the specified order from the current
 * CPU's swap entry pool (a cluster).
1317 */
static bool swap_alloc_fast(struct folio *folio)
1319 {
1320 unsigned int order = folio_order(folio);
1321 struct swap_cluster_info *ci;
1322 struct swap_info_struct *si;
1323 unsigned int offset;
1324
1325 /*
 * Once allocated, a swap_info_struct will never be completely freed,
 * so checking its liveness with get_swap_device_info() is enough.
1328 */
1329 si = this_cpu_read(percpu_swap_cluster.si[order]);
1330 offset = this_cpu_read(percpu_swap_cluster.offset[order]);
1331 if (!si || !offset || !get_swap_device_info(si))
1332 return false;
1333
1334 ci = swap_cluster_lock(si, offset);
1335 if (cluster_is_usable(ci, order)) {
1336 if (cluster_is_empty(ci))
1337 offset = cluster_offset(si, ci);
1338 alloc_swap_scan_cluster(si, ci, folio, offset);
1339 } else {
1340 swap_cluster_unlock(ci);
1341 }
1342
1343 put_swap_device(si);
1344 return folio_test_swapcache(folio);
1345 }
1346
/* Slow path: walk the available devices in priority order and allocate from one. */
static void swap_alloc_slow(struct folio *folio)
1349 {
1350 struct swap_info_struct *si, *next;
1351
1352 spin_lock(&swap_avail_lock);
1353 start_over:
1354 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
1355 /* Rotate the device and switch to a new cluster */
1356 plist_requeue(&si->avail_list, &swap_avail_head);
1357 spin_unlock(&swap_avail_lock);
1358 if (get_swap_device_info(si)) {
1359 cluster_alloc_swap_entry(si, folio);
1360 put_swap_device(si);
1361 if (folio_test_swapcache(folio))
1362 return;
1363 if (folio_test_large(folio))
1364 return;
1365 }
1366
1367 spin_lock(&swap_avail_lock);
1368 /*
1369 * if we got here, it's likely that si was almost full before,
1370 * multiple callers probably all tried to get a page from the
1371 * same si and it filled up before we could get one; or, the si
1372 * filled up between us dropping swap_avail_lock.
1373 * Since we dropped the swap_avail_lock, the swap_avail_list
1374 * may have been modified; so if next is still in the
1375 * swap_avail_head list then try it, otherwise start over if we
1376 * have not gotten any slots.
1377 */
1378 if (plist_node_empty(&next->avail_list))
1379 goto start_over;
1380 }
1381 spin_unlock(&swap_avail_lock);
1382 }
1383
1384 /*
 * Discard pending clusters synchronously when under high pressure.
1386 * Return: true if any cluster is discarded.
1387 */
static bool swap_sync_discard(void)
1389 {
1390 bool ret = false;
1391 struct swap_info_struct *si, *next;
1392
1393 spin_lock(&swap_lock);
1394 start_over:
1395 plist_for_each_entry_safe(si, next, &swap_active_head, list) {
1396 spin_unlock(&swap_lock);
1397 if (get_swap_device_info(si)) {
1398 if (si->flags & SWP_PAGE_DISCARD)
1399 ret = swap_do_scheduled_discard(si);
1400 put_swap_device(si);
1401 }
1402 if (ret)
1403 return true;
1404
1405 spin_lock(&swap_lock);
1406 if (plist_node_empty(&next->list))
1407 goto start_over;
1408 }
1409 spin_unlock(&swap_lock);
1410
1411 return false;
1412 }
1413
1414 /**
1415 * swap_put_entries_cluster - Decrease the swap count of a set of slots.
1416 * @si: The swap device.
1417 * @start: start offset of slots.
1418 * @nr: number of slots.
1419 * @reclaim_cache: if true, also reclaim the swap cache.
1420 *
1421 * This helper decreases the swap count of a set of slots and tries to
1422 * batch free them. Also reclaims the swap cache if @reclaim_cache is true.
1423 * Context: The caller must ensure that all slots belong to the same
 * cluster and their swap count doesn't underflow.
1425 */
static void swap_put_entries_cluster(struct swap_info_struct *si,
1427 unsigned long start, int nr,
1428 bool reclaim_cache)
1429 {
1430 unsigned long offset = start, end = start + nr;
1431 unsigned long batch_start = SWAP_ENTRY_INVALID;
1432 struct swap_cluster_info *ci;
1433 bool need_reclaim = false;
1434 unsigned int nr_reclaimed;
1435 unsigned long swp_tb;
1436 unsigned int count;
1437
1438 ci = swap_cluster_lock(si, offset);
1439 do {
1440 swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
1441 count = si->swap_map[offset];
1442 VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
1443 if (count == 1) {
1444 /* count == 1 and non-cached slots will be batch freed. */
1445 if (!swp_tb_is_folio(swp_tb)) {
1446 if (!batch_start)
1447 batch_start = offset;
1448 continue;
1449 }
1450 /* count will be 0 after put, slot can be reclaimed */
1451 need_reclaim = true;
1452 }
1453 /*
1454 * A count != 1 or cached slot can't be freed. Put its swap
1455 * count and then free the interrupted pending batch. Cached
1456 * slots will be freed when folio is removed from swap cache
1457 * (__swap_cache_del_folio).
1458 */
1459 swap_put_entry_locked(si, ci, offset);
1460 if (batch_start) {
1461 swap_entries_free(si, ci, batch_start, offset - batch_start);
1462 batch_start = SWAP_ENTRY_INVALID;
1463 }
1464 } while (++offset < end);
1465
1466 if (batch_start)
1467 swap_entries_free(si, ci, batch_start, offset - batch_start);
1468 swap_cluster_unlock(ci);
1469
1470 if (!need_reclaim || !reclaim_cache)
1471 return;
1472
1473 offset = start;
1474 do {
1475 nr_reclaimed = __try_to_reclaim_swap(si, offset,
1476 TTRS_UNMAPPED | TTRS_FULL);
1477 offset++;
1478 if (nr_reclaimed)
1479 offset = round_up(offset, abs(nr_reclaimed));
1480 } while (offset < end);
1481 }
1482
1483 /**
1484 * folio_alloc_swap - allocate swap space for a folio
1485 * @folio: folio we want to move to swap
1486 *
1487 * Allocate swap space for the folio and add the folio to the
1488 * swap cache.
1489 *
1490 * Context: Caller needs to hold the folio lock.
1491 * Return: Whether the folio was added to the swap cache.
1492 */
int folio_alloc_swap(struct folio *folio)
1494 {
1495 unsigned int order = folio_order(folio);
1496 unsigned int size = 1 << order;
1497
1498 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1499 VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
1500
1501 if (order) {
1502 /*
 * Reject large allocations when THP_SWAP is disabled;
 * the caller should split the folio and try again.
1505 */
1506 if (!IS_ENABLED(CONFIG_THP_SWAP))
1507 return -EAGAIN;
1508
1509 /*
1510 * Allocation size should never exceed cluster size
1511 * (HPAGE_PMD_SIZE).
1512 */
1513 if (size > SWAPFILE_CLUSTER) {
1514 VM_WARN_ON_ONCE(1);
1515 return -EINVAL;
1516 }
1517 }
1518
1519 again:
1520 local_lock(&percpu_swap_cluster.lock);
1521 if (!swap_alloc_fast(folio))
1522 swap_alloc_slow(folio);
1523 local_unlock(&percpu_swap_cluster.lock);
1524
1525 if (!order && unlikely(!folio_test_swapcache(folio))) {
1526 if (swap_sync_discard())
1527 goto again;
1528 }
1529
1530 /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
1531 if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
1532 swap_cache_del_folio(folio);
1533
1534 if (unlikely(!folio_test_swapcache(folio)))
1535 return -ENOMEM;
1536
1537 return 0;
1538 }
1539
1540 /**
1541 * folio_dup_swap() - Increase swap count of swap entries of a folio.
 * @folio: folio with swap entries bound to it.
1543 * @subpage: if not NULL, only increase the swap count of this subpage.
1544 *
 * Typically called when the folio is unmapped and its swap entry is left
 * to take its place.
1547 *
1548 * Context: Caller must ensure the folio is locked and in the swap cache.
 * NOTE: The caller also has to ensure there is no racing call to
1550 * swap_put_entries_direct on its swap entry before this helper returns, or
1551 * the swap map may underflow. Currently, we only accept @subpage == NULL
1552 * for shmem due to the limitation of swap continuation: shmem always
1553 * duplicates the swap entry only once, so there is no such issue for it.
1554 */
int folio_dup_swap(struct folio *folio, struct page *subpage)
1556 {
1557 int err = 0;
1558 swp_entry_t entry = folio->swap;
1559 unsigned long nr_pages = folio_nr_pages(folio);
1560
1561 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
1562 VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
1563
1564 if (subpage) {
1565 entry.val += folio_page_idx(folio, subpage);
1566 nr_pages = 1;
1567 }
1568
1569 while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
1570 err = add_swap_count_continuation(entry, GFP_ATOMIC);
1571
1572 return err;
1573 }
1574
1575 /**
1576 * folio_put_swap() - Decrease swap count of swap entries of a folio.
 * @folio: folio with swap entries bound to it, must be in swap cache and locked.
1578 * @subpage: if not NULL, only decrease the swap count of this subpage.
1579 *
 * This won't free the swap slots even if the swap count drops to zero; they are
 * still pinned by the swap cache. The user may call folio_free_swap() to free them.
1582 * Context: Caller must ensure the folio is locked and in the swap cache.
1583 */
void folio_put_swap(struct folio *folio, struct page *subpage)
1585 {
1586 swp_entry_t entry = folio->swap;
1587 unsigned long nr_pages = folio_nr_pages(folio);
1588 struct swap_info_struct *si = __swap_entry_to_info(entry);
1589
1590 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
1591 VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
1592
1593 if (subpage) {
1594 entry.val += folio_page_idx(folio, subpage);
1595 nr_pages = 1;
1596 }
1597
1598 swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
1599 }
1600
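/*
 * Drop one swap count reference of a slot, handling count continuation.
 * Frees the slot if the count drops to zero and no swap cache folio
 * remains. The caller must hold the cluster lock.
 */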
static void swap_put_entry_locked(struct swap_info_struct *si,
1602 struct swap_cluster_info *ci,
1603 unsigned long offset)
1604 {
1605 unsigned char count;
1606
1607 count = si->swap_map[offset];
1608 if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1609 if (count == COUNT_CONTINUED) {
1610 if (swap_count_continued(si, offset, count))
1611 count = SWAP_MAP_MAX | COUNT_CONTINUED;
1612 else
1613 count = SWAP_MAP_MAX;
1614 } else
1615 count--;
1616 }
1617
1618 WRITE_ONCE(si->swap_map[offset], count);
1619 if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
1620 swap_entries_free(si, ci, offset, 1);
1621 }
1622
1623 /*
 * When we get a swap entry, if there isn't some other way to
 * prevent swapoff, such as the folio in the swap cache being locked, or the
 * RCU reader side being locked, etc., the swap entry may become invalid
 * because of swapoff. Then, we need to enclose all swap related
1628 * functions with get_swap_device() and put_swap_device(), unless the
1629 * swap functions call get/put_swap_device() by themselves.
1630 *
1631 * RCU reader side lock (including any spinlock) is sufficient to
1632 * prevent swapoff, because synchronize_rcu() is called in swapoff()
1633 * before freeing data structures.
1634 *
1635 * Check whether swap entry is valid in the swap device. If so,
1636 * return pointer to swap_info_struct, and keep the swap entry valid
1637 * via preventing the swap device from being swapoff, until
1638 * put_swap_device() is called. Otherwise return NULL.
1639 *
1640 * Notice that swapoff or swapoff+swapon can still happen before the
1641 * percpu_ref_tryget_live() in get_swap_device() or after the
1642 * percpu_ref_put() in put_swap_device() if there isn't any other way
1643 * to prevent swapoff. The caller must be prepared for that. For
1644 * example, the following situation is possible.
1645 *
1646 * CPU1 CPU2
1647 * do_swap_page()
1648 * ... swapoff+swapon
1649 * swap_cache_alloc_folio()
1650 * swap_cache_add_folio()
1651 * // check swap_map
1652 * // verify PTE not changed
1653 *
 * In __swap_duplicate(), the swap_map needs to be checked before
1655 * changing partly because the specified swap entry may be for another
1656 * swap device which has been swapoff. And in do_swap_page(), after
1657 * the page is read from the swap device, the PTE is verified not
1658 * changed with the page table locked to check whether the swap device
1659 * has been swapoff or swapoff+swapon.
1660 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
1662 {
1663 struct swap_info_struct *si;
1664 unsigned long offset;
1665
1666 if (!entry.val)
1667 goto out;
1668 si = swap_entry_to_info(entry);
1669 if (!si)
1670 goto bad_nofile;
1671 if (!get_swap_device_info(si))
1672 goto out;
1673 offset = swp_offset(entry);
1674 if (offset >= si->max)
1675 goto put_out;
1676
1677 return si;
1678 bad_nofile:
1679 pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1680 out:
1681 return NULL;
1682 put_out:
1683 pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1684 percpu_ref_put(&si->users);
1685 return NULL;
1686 }
1687
1688 /*
 * Drop the last ref of swap entries; the caller has to ensure all entries
 * belong to the same cgroup and cluster.
1691 */
void swap_entries_free(struct swap_info_struct *si,
1693 struct swap_cluster_info *ci,
1694 unsigned long offset, unsigned int nr_pages)
1695 {
1696 swp_entry_t entry = swp_entry(si->type, offset);
1697 unsigned char *map = si->swap_map + offset;
1698 unsigned char *map_end = map + nr_pages;
1699
1700 /* It should never free entries across different clusters */
1701 VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
1702 VM_BUG_ON(cluster_is_empty(ci));
1703 VM_BUG_ON(ci->count < nr_pages);
1704
1705 ci->count -= nr_pages;
1706 do {
1707 VM_WARN_ON(*map > 1);
1708 *map = 0;
1709 } while (++map < map_end);
1710
1711 mem_cgroup_uncharge_swap(entry, nr_pages);
1712 swap_range_free(si, offset, nr_pages);
1713 swap_cluster_assert_table_empty(ci, offset, nr_pages);
1714
1715 if (!ci->count)
1716 free_cluster(si, ci);
1717 else
1718 partial_free_cluster(si, ci);
1719 }
1720
1721 int __swap_count(swp_entry_t entry)
1722 {
1723 struct swap_info_struct *si = __swap_entry_to_info(entry);
1724 pgoff_t offset = swp_offset(entry);
1725
1726 return si->swap_map[offset];
1727 }
1728
1729 /**
1730 * swap_entry_swapped - Check if the swap entry is swapped.
1731 * @si: the swap device.
1732 * @entry: the swap entry.
1733 */
1734 bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
1735 {
1736 pgoff_t offset = swp_offset(entry);
1737 struct swap_cluster_info *ci;
1738 int count;
1739
1740 ci = swap_cluster_lock(si, offset);
1741 count = si->swap_map[offset];
1742 swap_cluster_unlock(ci);
1743
1744 return count && count != SWAP_MAP_BAD;
1745 }
1746
1747 /*
1748 * How many references to @entry are currently swapped out?
1749 * This considers COUNT_CONTINUED so it returns the exact answer.
1750 */
1751 int swp_swapcount(swp_entry_t entry)
1752 {
1753 int count, tmp_count, n;
1754 struct swap_info_struct *si;
1755 struct swap_cluster_info *ci;
1756 struct page *page;
1757 pgoff_t offset;
1758 unsigned char *map;
1759
1760 si = get_swap_device(entry);
1761 if (!si)
1762 return 0;
1763
1764 offset = swp_offset(entry);
1765
1766 ci = swap_cluster_lock(si, offset);
1767
1768 count = si->swap_map[offset];
1769 if (!(count & COUNT_CONTINUED))
1770 goto out;
1771
1772 count &= ~COUNT_CONTINUED;
1773 n = SWAP_MAP_MAX + 1;
1774
1775 page = vmalloc_to_page(si->swap_map + offset);
1776 offset &= ~PAGE_MASK;
1777 VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1778
1779 do {
1780 page = list_next_entry(page, lru);
1781 map = kmap_local_page(page);
1782 tmp_count = map[offset];
1783 kunmap_local(map);
1784
1785 count += (tmp_count & ~COUNT_CONTINUED) * n;
1786 n *= (SWAP_CONT_MAX + 1);
1787 } while (tmp_count & COUNT_CONTINUED);
1788 out:
1789 swap_cluster_unlock(ci);
1790 put_swap_device(si);
1791 return count;
1792 }
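/*
 * Worked example (added for illustration, assuming the usual values
 * SWAP_MAP_MAX == 0x3e and SWAP_CONT_MAX == 0x7f): if the swap_map byte is
 * (SWAP_MAP_MAX | COUNT_CONTINUED) and the first continuation byte is 5
 * with no further continuations, the loop above computes
 * SWAP_MAP_MAX + 5 * (SWAP_MAP_MAX + 1) = 62 + 5 * 63 = 377 references;
 * each additional continuation page scales its byte by another factor of
 * (SWAP_CONT_MAX + 1).
 */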
1793
1794 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1795 swp_entry_t entry, int order)
1796 {
1797 struct swap_cluster_info *ci;
1798 unsigned char *map = si->swap_map;
1799 unsigned int nr_pages = 1 << order;
1800 unsigned long roffset = swp_offset(entry);
1801 unsigned long offset = round_down(roffset, nr_pages);
1802 int i;
1803 bool ret = false;
1804
1805 ci = swap_cluster_lock(si, offset);
1806 if (nr_pages == 1) {
1807 if (map[roffset])
1808 ret = true;
1809 goto unlock_out;
1810 }
1811 for (i = 0; i < nr_pages; i++) {
1812 if (map[offset + i]) {
1813 ret = true;
1814 break;
1815 }
1816 }
1817 unlock_out:
1818 swap_cluster_unlock(ci);
1819 return ret;
1820 }
1821
1822 static bool folio_swapped(struct folio *folio)
1823 {
1824 swp_entry_t entry = folio->swap;
1825 struct swap_info_struct *si;
1826
1827 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
1828 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
1829
1830 si = __swap_entry_to_info(entry);
1831 if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
1832 return swap_entry_swapped(si, entry);
1833
1834 return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
1835 }
1836
1837 static bool folio_swapcache_freeable(struct folio *folio)
1838 {
1839 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1840
1841 if (!folio_test_swapcache(folio))
1842 return false;
1843 if (folio_test_writeback(folio))
1844 return false;
1845
1846 /*
1847 * Once hibernation has begun to create its image of memory,
1848 * there's a danger that one of the calls to folio_free_swap()
1849 * - most probably a call from __try_to_reclaim_swap() while
1850 * hibernation is allocating its own swap pages for the image,
1851 * but conceivably even a call from memory reclaim - will free
1852 * the swap from a folio which has already been recorded in the
1853 * image as a clean swapcache folio, and then reuse its swap for
1854 * another page of the image. On waking from hibernation, the
1855 * original folio might be freed under memory pressure, then
1856 * later read back in from swap, now with the wrong data.
1857 *
1858 * Hibernation suspends storage while it is writing the image
1859 * to disk so check that here.
1860 */
1861 if (pm_suspended_storage())
1862 return false;
1863
1864 return true;
1865 }
1866
1867 /**
1868 * folio_free_swap() - Free the swap space used for this folio.
1869 * @folio: The folio to remove.
1870 *
1871 * If swap is getting full, or if there are no more mappings of this folio,
1872 * then call folio_free_swap to free its swap space.
1873 *
1874 * Return: true if we were able to release the swap space.
1875 */
1876 bool folio_free_swap(struct folio *folio)
1877 {
1878 if (!folio_swapcache_freeable(folio))
1879 return false;
1880 if (folio_swapped(folio))
1881 return false;
1882
1883 swap_cache_del_folio(folio);
1884 folio_set_dirty(folio);
1885 return true;
1886 }
1887
1888 /**
1889 * swap_put_entries_direct() - Release reference on range of swap entries and
1890 * reclaim their cache if no more references remain.
1891 * @entry: First entry of range.
1892 * @nr: Number of entries in range.
1893 *
1894 * For each swap entry in the contiguous range, release a reference. If any swap
1895 * entries become free, try to reclaim their underlying folios, if present. The
1896 * offset range is defined by [entry.offset, entry.offset + nr).
1897 *
1898 * Context: Caller must ensure there is no race condition on the reference
1899 * owner. e.g., locking the PTL of a PTE containing the entry being released.
1900 */
1901 void swap_put_entries_direct(swp_entry_t entry, int nr)
1902 {
1903 const unsigned long start_offset = swp_offset(entry);
1904 const unsigned long end_offset = start_offset + nr;
1905 unsigned long offset, cluster_end;
1906 struct swap_info_struct *si;
1907
1908 si = get_swap_device(entry);
1909 if (WARN_ON_ONCE(!si))
1910 return;
1911 if (WARN_ON_ONCE(end_offset > si->max))
1912 goto out;
1913
1914 /* Put entries and reclaim cache in each cluster */
1915 offset = start_offset;
1916 do {
1917 cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset);
1918 swap_put_entries_cluster(si, offset, cluster_end - offset, true);
1919 offset = cluster_end;
1920 } while (offset < end_offset);
1921 out:
1922 put_swap_device(si);
1923 }
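/*
 * Worked example (added for illustration, assuming SWAPFILE_CLUSTER is
 * 512): a call to swap_put_entries_direct() covering offsets [510, 515)
 * is split by the loop above into swap_put_entries_cluster() calls for
 * [510, 512) and [512, 515), so each call stays within one cluster.
 */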
1924
1925 #ifdef CONFIG_HIBERNATION
1926 /* Allocate a slot for hibernation */
1927 swp_entry_t swap_alloc_hibernation_slot(int type)
1928 {
1929 struct swap_info_struct *si = swap_type_to_info(type);
1930 unsigned long offset;
1931 swp_entry_t entry = {0};
1932
1933 if (!si)
1934 goto fail;
1935
1936 /* This is called for allocating a swap entry, not for the swap cache */
1937 if (get_swap_device_info(si)) {
1938 if (si->flags & SWP_WRITEOK) {
1939 /*
1940 * Grab the local lock to be compliant
1941 * with swap table allocation.
1942 */
1943 local_lock(&percpu_swap_cluster.lock);
1944 offset = cluster_alloc_swap_entry(si, NULL);
1945 local_unlock(&percpu_swap_cluster.lock);
1946 if (offset)
1947 entry = swp_entry(si->type, offset);
1948 }
1949 put_swap_device(si);
1950 }
1951 fail:
1952 return entry;
1953 }
1954
1955 /* Free a slot allocated by swap_alloc_hibernation_slot */
1956 void swap_free_hibernation_slot(swp_entry_t entry)
1957 {
1958 struct swap_info_struct *si;
1959 struct swap_cluster_info *ci;
1960 pgoff_t offset = swp_offset(entry);
1961
1962 si = get_swap_device(entry);
1963 if (WARN_ON(!si))
1964 return;
1965
1966 ci = swap_cluster_lock(si, offset);
1967 swap_put_entry_locked(si, ci, offset);
1968 swap_cluster_unlock(ci);
1969
1970 /* In theory readahead might add it to the swap cache by accident */
1971 __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
1972 put_swap_device(si);
1973 }
1974
1975 /*
1976 * Find the swap type that corresponds to given device (if any).
1977 *
1978 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1979 * from 0, in which the swap header is expected to be located.
1980 *
1981 * This is needed for the suspend to disk (aka swsusp).
1982 */
1983 int swap_type_of(dev_t device, sector_t offset)
1984 {
1985 int type;
1986
1987 if (!device)
1988 return -1;
1989
1990 spin_lock(&swap_lock);
1991 for (type = 0; type < nr_swapfiles; type++) {
1992 struct swap_info_struct *sis = swap_info[type];
1993
1994 if (!(sis->flags & SWP_WRITEOK))
1995 continue;
1996
1997 if (device == sis->bdev->bd_dev) {
1998 struct swap_extent *se = first_se(sis);
1999
2000 if (se->start_block == offset) {
2001 spin_unlock(&swap_lock);
2002 return type;
2003 }
2004 }
2005 }
2006 spin_unlock(&swap_lock);
2007 return -ENODEV;
2008 }
2009
2010 int find_first_swap(dev_t *device)
2011 {
2012 int type;
2013
2014 spin_lock(&swap_lock);
2015 for (type = 0; type < nr_swapfiles; type++) {
2016 struct swap_info_struct *sis = swap_info[type];
2017
2018 if (!(sis->flags & SWP_WRITEOK))
2019 continue;
2020 *device = sis->bdev->bd_dev;
2021 spin_unlock(&swap_lock);
2022 return type;
2023 }
2024 spin_unlock(&swap_lock);
2025 return -ENODEV;
2026 }
2027
2028 /*
2029 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
2030 * corresponding to given index in swap_info (swap type).
2031 */
2032 sector_t swapdev_block(int type, pgoff_t offset)
2033 {
2034 struct swap_info_struct *si = swap_type_to_info(type);
2035 struct swap_extent *se;
2036
2037 if (!si || !(si->flags & SWP_WRITEOK))
2038 return 0;
2039 se = offset_to_swap_extent(si, offset);
2040 return se->start_block + (offset - se->start_page);
2041 }
2042
2043 /*
2044 * Return either the total number of swap pages of given type, or the number
2045 * of free pages of that type (depending on @free)
2046 *
2047 * This is needed for software suspend
2048 */
2049 unsigned int count_swap_pages(int type, int free)
2050 {
2051 unsigned int n = 0;
2052
2053 spin_lock(&swap_lock);
2054 if ((unsigned int)type < nr_swapfiles) {
2055 struct swap_info_struct *sis = swap_info[type];
2056
2057 spin_lock(&sis->lock);
2058 if (sis->flags & SWP_WRITEOK) {
2059 n = sis->pages;
2060 if (free)
2061 n -= swap_usage_in_pages(sis);
2062 }
2063 spin_unlock(&sis->lock);
2064 }
2065 spin_unlock(&swap_lock);
2066 return n;
2067 }
2068 #endif /* CONFIG_HIBERNATION */
2069
2070 static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
2071 {
2072 return pte_same(pte_swp_clear_flags(pte), swp_pte);
2073 }
2074
2075 /*
2076 * No need to decide whether this PTE shares the swap entry with others,
2077 * just let do_wp_page work it out if a write is requested later - to
2078 * force COW, vm_page_prot omits write permission from any private vma.
2079 */
2080 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
2081 unsigned long addr, swp_entry_t entry, struct folio *folio)
2082 {
2083 struct page *page;
2084 struct folio *swapcache;
2085 spinlock_t *ptl;
2086 pte_t *pte, new_pte, old_pte;
2087 bool hwpoisoned = false;
2088 int ret = 1;
2089
2090 /*
2091 * If the folio is removed from swap cache by others, continue to
2092 * unuse other PTEs. try_to_unuse may try again if we missed this one.
2093 */
2094 if (!folio_matches_swap_entry(folio, entry))
2095 return 0;
2096
2097 swapcache = folio;
2098 folio = ksm_might_need_to_copy(folio, vma, addr);
2099 if (unlikely(!folio))
2100 return -ENOMEM;
2101 else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
2102 hwpoisoned = true;
2103 folio = swapcache;
2104 }
2105
2106 page = folio_file_page(folio, swp_offset(entry));
2107 if (PageHWPoison(page))
2108 hwpoisoned = true;
2109
2110 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
2111 if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
2112 swp_entry_to_pte(entry)))) {
2113 ret = 0;
2114 goto out;
2115 }
2116
2117 old_pte = ptep_get(pte);
2118
2119 if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
2120 swp_entry_t swp_entry;
2121
2122 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
2123 if (hwpoisoned) {
2124 swp_entry = make_hwpoison_entry(page);
2125 } else {
2126 swp_entry = make_poisoned_swp_entry();
2127 }
2128 new_pte = swp_entry_to_pte(swp_entry);
2129 ret = 0;
2130 goto setpte;
2131 }
2132
2133 /*
2134 * Some architectures may have to restore extra metadata to the page
2135 * when reading from swap. This metadata may be indexed by swap entry
2136 * so this must be called before folio_put_swap().
2137 */
2138 arch_swap_restore(folio_swap(entry, folio), folio);
2139
2140 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
2141 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
2142 folio_get(folio);
2143 if (folio == swapcache) {
2144 rmap_t rmap_flags = RMAP_NONE;
2145
2146 /*
2147 * See do_swap_page(): writeback would be problematic.
2148 * However, we do a folio_wait_writeback() just before this
2149 * call and have the folio locked.
2150 */
2151 VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
2152 if (pte_swp_exclusive(old_pte))
2153 rmap_flags |= RMAP_EXCLUSIVE;
2154 /*
2155 * We currently only expect small !anon folios, which are either
2156 * fully exclusive or fully shared. If we ever get large folios
2157 * here, we have to be careful.
2158 */
2159 if (!folio_test_anon(folio)) {
2160 VM_WARN_ON_ONCE(folio_test_large(folio));
2161 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
2162 folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
2163 } else {
2164 folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
2165 }
2166 } else { /* ksm created a completely new copy */
2167 folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
2168 folio_add_lru_vma(folio, vma);
2169 }
2170 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
2171 if (pte_swp_soft_dirty(old_pte))
2172 new_pte = pte_mksoft_dirty(new_pte);
2173 if (pte_swp_uffd_wp(old_pte))
2174 new_pte = pte_mkuffd_wp(new_pte);
2175 setpte:
2176 set_pte_at(vma->vm_mm, addr, pte, new_pte);
2177 folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry)));
2178 out:
2179 if (pte)
2180 pte_unmap_unlock(pte, ptl);
2181 if (folio != swapcache) {
2182 folio_unlock(folio);
2183 folio_put(folio);
2184 }
2185 return ret;
2186 }
2187
2188 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
2189 unsigned long addr, unsigned long end,
2190 unsigned int type)
2191 {
2192 pte_t *pte = NULL;
2193 struct swap_info_struct *si;
2194
2195 si = swap_info[type];
2196 do {
2197 struct folio *folio;
2198 unsigned long offset;
2199 unsigned char swp_count;
2200 softleaf_t entry;
2201 int ret;
2202 pte_t ptent;
2203
2204 if (!pte++) {
2205 pte = pte_offset_map(pmd, addr);
2206 if (!pte)
2207 break;
2208 }
2209
2210 ptent = ptep_get_lockless(pte);
2211 entry = softleaf_from_pte(ptent);
2212
2213 if (!softleaf_is_swap(entry))
2214 continue;
2215 if (swp_type(entry) != type)
2216 continue;
2217
2218 offset = swp_offset(entry);
2219 pte_unmap(pte);
2220 pte = NULL;
2221
2222 folio = swap_cache_get_folio(entry);
2223 if (!folio) {
2224 struct vm_fault vmf = {
2225 .vma = vma,
2226 .address = addr,
2227 .real_address = addr,
2228 .pmd = pmd,
2229 };
2230
2231 folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2232 &vmf);
2233 }
2234 if (!folio) {
2235 swp_count = READ_ONCE(si->swap_map[offset]);
2236 if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
2237 continue;
2238 return -ENOMEM;
2239 }
2240
2241 folio_lock(folio);
2242 folio_wait_writeback(folio);
2243 ret = unuse_pte(vma, pmd, addr, entry, folio);
2244 if (ret < 0) {
2245 folio_unlock(folio);
2246 folio_put(folio);
2247 return ret;
2248 }
2249
2250 folio_free_swap(folio);
2251 folio_unlock(folio);
2252 folio_put(folio);
2253 } while (addr += PAGE_SIZE, addr != end);
2254
2255 if (pte)
2256 pte_unmap(pte);
2257 return 0;
2258 }
2259
2260 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
2261 unsigned long addr, unsigned long end,
2262 unsigned int type)
2263 {
2264 pmd_t *pmd;
2265 unsigned long next;
2266 int ret;
2267
2268 pmd = pmd_offset(pud, addr);
2269 do {
2270 cond_resched();
2271 next = pmd_addr_end(addr, end);
2272 ret = unuse_pte_range(vma, pmd, addr, next, type);
2273 if (ret)
2274 return ret;
2275 } while (pmd++, addr = next, addr != end);
2276 return 0;
2277 }
2278
2279 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
2280 unsigned long addr, unsigned long end,
2281 unsigned int type)
2282 {
2283 pud_t *pud;
2284 unsigned long next;
2285 int ret;
2286
2287 pud = pud_offset(p4d, addr);
2288 do {
2289 next = pud_addr_end(addr, end);
2290 if (pud_none_or_clear_bad(pud))
2291 continue;
2292 ret = unuse_pmd_range(vma, pud, addr, next, type);
2293 if (ret)
2294 return ret;
2295 } while (pud++, addr = next, addr != end);
2296 return 0;
2297 }
2298
2299 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
2300 unsigned long addr, unsigned long end,
2301 unsigned int type)
2302 {
2303 p4d_t *p4d;
2304 unsigned long next;
2305 int ret;
2306
2307 p4d = p4d_offset(pgd, addr);
2308 do {
2309 next = p4d_addr_end(addr, end);
2310 if (p4d_none_or_clear_bad(p4d))
2311 continue;
2312 ret = unuse_pud_range(vma, p4d, addr, next, type);
2313 if (ret)
2314 return ret;
2315 } while (p4d++, addr = next, addr != end);
2316 return 0;
2317 }
2318
2319 static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
2320 {
2321 pgd_t *pgd;
2322 unsigned long addr, end, next;
2323 int ret;
2324
2325 addr = vma->vm_start;
2326 end = vma->vm_end;
2327
2328 pgd = pgd_offset(vma->vm_mm, addr);
2329 do {
2330 next = pgd_addr_end(addr, end);
2331 if (pgd_none_or_clear_bad(pgd))
2332 continue;
2333 ret = unuse_p4d_range(vma, pgd, addr, next, type);
2334 if (ret)
2335 return ret;
2336 } while (pgd++, addr = next, addr != end);
2337 return 0;
2338 }
2339
2340 static int unuse_mm(struct mm_struct *mm, unsigned int type)
2341 {
2342 struct vm_area_struct *vma;
2343 int ret = 0;
2344 VMA_ITERATOR(vmi, mm, 0);
2345
2346 mmap_read_lock(mm);
2347 if (check_stable_address_space(mm))
2348 goto unlock;
2349 for_each_vma(vmi, vma) {
2350 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
2351 ret = unuse_vma(vma, type);
2352 if (ret)
2353 break;
2354 }
2355
2356 cond_resched();
2357 }
2358 unlock:
2359 mmap_read_unlock(mm);
2360 return ret;
2361 }
2362
2363 /*
2364 * Scan the swap_map from the current position to the next entry still
2365 * in use. Return 0 if there are no in-use entries after prev through
2366 * the end of the map.
2367 */
2368 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2369 unsigned int prev)
2370 {
2371 unsigned int i;
2372 unsigned long swp_tb;
2373 unsigned char count;
2374
2375 /*
2376 * No need for swap_lock here: we're just looking
2377 * for whether an entry is in use, not modifying it; false
2378 * hits are okay, and sys_swapoff() has already prevented new
2379 * allocations from this area (while holding swap_lock).
2380 */
2381 for (i = prev + 1; i < si->max; i++) {
2382 count = READ_ONCE(si->swap_map[i]);
2383 swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
2384 i % SWAPFILE_CLUSTER);
2385 if (count == SWAP_MAP_BAD)
2386 continue;
2387 if (count || swp_tb_is_folio(swp_tb))
2388 break;
2389 if ((i % LATENCY_LIMIT) == 0)
2390 cond_resched();
2391 }
2392
2393 if (i == si->max)
2394 i = 0;
2395
2396 return i;
2397 }
2398
2399 static int try_to_unuse(unsigned int type)
2400 {
2401 struct mm_struct *prev_mm;
2402 struct mm_struct *mm;
2403 struct list_head *p;
2404 int retval = 0;
2405 struct swap_info_struct *si = swap_info[type];
2406 struct folio *folio;
2407 swp_entry_t entry;
2408 unsigned int i;
2409
2410 if (!swap_usage_in_pages(si))
2411 goto success;
2412
2413 retry:
2414 retval = shmem_unuse(type);
2415 if (retval)
2416 return retval;
2417
2418 prev_mm = &init_mm;
2419 mmget(prev_mm);
2420
2421 spin_lock(&mmlist_lock);
2422 p = &init_mm.mmlist;
2423 while (swap_usage_in_pages(si) &&
2424 !signal_pending(current) &&
2425 (p = p->next) != &init_mm.mmlist) {
2426
2427 mm = list_entry(p, struct mm_struct, mmlist);
2428 if (!mmget_not_zero(mm))
2429 continue;
2430 spin_unlock(&mmlist_lock);
2431 mmput(prev_mm);
2432 prev_mm = mm;
2433 retval = unuse_mm(mm, type);
2434 if (retval) {
2435 mmput(prev_mm);
2436 return retval;
2437 }
2438
2439 /*
2440 * Make sure that we aren't completely killing
2441 * interactive performance.
2442 */
2443 cond_resched();
2444 spin_lock(&mmlist_lock);
2445 }
2446 spin_unlock(&mmlist_lock);
2447
2448 mmput(prev_mm);
2449
2450 i = 0;
2451 while (swap_usage_in_pages(si) &&
2452 !signal_pending(current) &&
2453 (i = find_next_to_unuse(si, i)) != 0) {
2454
2455 entry = swp_entry(type, i);
2456 folio = swap_cache_get_folio(entry);
2457 if (!folio)
2458 continue;
2459
2460 /*
2461 * It is conceivable that a racing task removed this folio from
2462 * swap cache just before we acquired the page lock. The folio
2463 * might even be back in swap cache on another swap area. But
2464 * that is okay, folio_free_swap() only removes stale folios.
2465 */
2466 folio_lock(folio);
2467 folio_wait_writeback(folio);
2468 folio_free_swap(folio);
2469 folio_unlock(folio);
2470 folio_put(folio);
2471 }
2472
2473 /*
2474 * Let's check again to see if there are still swap entries in the map.
2475 * If so, we need to retry the unuse logic.
2476 * Under global memory pressure, swap entries can be reinserted back
2477 * into process space after the mmlist loop above passes over them.
2478 *
2479 * Limit the number of retries? No: when mmget_not_zero()
2480 * above fails, that mm is likely to be freeing swap from
2481 * exit_mmap(), which proceeds at its own independent pace;
2482 * and even shmem_writeout() could have been preempted after
2483 * folio_alloc_swap(), temporarily hiding that swap. It's easy
2484 * and robust (though cpu-intensive) just to keep retrying.
2485 */
2486 if (swap_usage_in_pages(si)) {
2487 if (!signal_pending(current))
2488 goto retry;
2489 return -EINTR;
2490 }
2491
2492 success:
2493 /*
2494 * Make sure that further cleanups after try_to_unuse() returns happen
2495 * after swap_range_free() reduces si->inuse_pages to 0.
2496 */
2497 smp_mb();
2498 return 0;
2499 }
2500
2501 /*
2502 * After a successful try_to_unuse, if no swap is now in use, we know
2503 * we can empty the mmlist. swap_lock must be held on entry and exit.
2504 * Note that mmlist_lock nests inside swap_lock, and an mm must be
2505 * added to the mmlist just after page_duplicate - before would be racy.
2506 */
2507 static void drain_mmlist(void)
2508 {
2509 struct list_head *p, *next;
2510 unsigned int type;
2511
2512 for (type = 0; type < nr_swapfiles; type++)
2513 if (swap_usage_in_pages(swap_info[type]))
2514 return;
2515 spin_lock(&mmlist_lock);
2516 list_for_each_safe(p, next, &init_mm.mmlist)
2517 list_del_init(p);
2518 spin_unlock(&mmlist_lock);
2519 }
2520
2521 /*
2522 * Free all of a swapdev's extent information
2523 */
2524 static void destroy_swap_extents(struct swap_info_struct *sis)
2525 {
2526 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2527 struct rb_node *rb = sis->swap_extent_root.rb_node;
2528 struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
2529
2530 rb_erase(rb, &sis->swap_extent_root);
2531 kfree(se);
2532 }
2533
2534 if (sis->flags & SWP_ACTIVATED) {
2535 struct file *swap_file = sis->swap_file;
2536 struct address_space *mapping = swap_file->f_mapping;
2537
2538 sis->flags &= ~SWP_ACTIVATED;
2539 if (mapping->a_ops->swap_deactivate)
2540 mapping->a_ops->swap_deactivate(swap_file);
2541 }
2542 }
2543
2544 /*
2545 * Add a block range (and the corresponding page range) into this swapdev's
2546 * extent tree.
2547 *
2548 * This function rather assumes that it is called in ascending page order.
2549 */
2550 int
2551 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
2552 unsigned long nr_pages, sector_t start_block)
2553 {
2554 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
2555 struct swap_extent *se;
2556 struct swap_extent *new_se;
2557
2558 /*
2559 * Place the new node at the rightmost position, since the
2560 * function is called in ascending page order.
2561 */
2562 while (*link) {
2563 parent = *link;
2564 link = &parent->rb_right;
2565 }
2566
2567 if (parent) {
2568 se = rb_entry(parent, struct swap_extent, rb_node);
2569 BUG_ON(se->start_page + se->nr_pages != start_page);
2570 if (se->start_block + se->nr_pages == start_block) {
2571 /* Merge it */
2572 se->nr_pages += nr_pages;
2573 return 0;
2574 }
2575 }
2576
2577 /* No merge, insert a new extent. */
2578 new_se = kmalloc_obj(*se);
2579 if (new_se == NULL)
2580 return -ENOMEM;
2581 new_se->start_page = start_page;
2582 new_se->nr_pages = nr_pages;
2583 new_se->start_block = start_block;
2584
2585 rb_link_node(&new_se->rb_node, parent, link);
2586 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2587 return 1;
2588 }
2589 EXPORT_SYMBOL_GPL(add_swap_extent);
2590
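/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * ->swap_activate() implementation typically walks the file's physically
 * contiguous ranges and feeds them to add_swap_extent(). Roughly, with
 * my_fs_map_contig() standing in for a hypothetical helper that returns
 * the length of a contiguous run starting at @page and its first block:
 *
 *	unsigned long page = 0;
 *	int nr_extents = 0, ret;
 *
 *	while (page < sis->max) {
 *		sector_t first_block;
 *		unsigned long nr = my_fs_map_contig(inode, page, &first_block);
 *
 *		ret = add_swap_extent(sis, page, nr, first_block);
 *		if (ret < 0)
 *			return ret;
 *		nr_extents += ret;
 *		page += nr;
 *	}
 *	return nr_extents;
 *
 * A real implementation would also set *span to the range of blocks
 * covered, in PAGE_SIZE units. setup_swap_extents() below expects the
 * number of extents (or a negative error) as the return value, matching
 * this sketch.
 */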
2591 /*
2592 * A `swap extent' is a simple thing which maps a contiguous range of pages
2593 * onto a contiguous range of disk blocks. An rbtree of swap extents is
2594 * built at swapon time and is then used at swap_writepage/swap_read_folio
2595 * time for locating where on disk a page belongs.
2596 *
2597 * If the swapfile is an S_ISBLK block device, a single extent is installed.
2598 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
2599 * swap files identically.
2600 *
2601 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2602 * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
2603 * swapfiles are handled *identically* after swapon time.
2604 *
2605 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2606 * and will parse them into an rbtree, in PAGE_SIZE chunks. If some stray
2607 * blocks are found which do not fall within the PAGE_SIZE alignment
2608 * requirements, they are simply tossed out - we will never use those blocks
2609 * for swapping.
2610 *
2611 * For all swap devices we set S_SWAPFILE across the life of the swapon. This
2612 * prevents users from writing to the swap device, which will corrupt memory.
2613 *
2614 * The amount of disk space which a single swap extent represents varies.
2615 * Typically it is in the 1-4 megabyte range. So we can have hundreds of
2616 * extents in the rbtree. - akpm.
2617 */
2618 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
2619 {
2620 struct file *swap_file = sis->swap_file;
2621 struct address_space *mapping = swap_file->f_mapping;
2622 struct inode *inode = mapping->host;
2623 int ret;
2624
2625 if (S_ISBLK(inode->i_mode)) {
2626 ret = add_swap_extent(sis, 0, sis->max, 0);
2627 *span = sis->pages;
2628 return ret;
2629 }
2630
2631 if (mapping->a_ops->swap_activate) {
2632 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2633 if (ret < 0)
2634 return ret;
2635 sis->flags |= SWP_ACTIVATED;
2636 if ((sis->flags & SWP_FS_OPS) &&
2637 sio_pool_init() != 0) {
2638 destroy_swap_extents(sis);
2639 return -ENOMEM;
2640 }
2641 return ret;
2642 }
2643
2644 return generic_swapfile_activate(sis, swap_file, span);
2645 }
2646
2647 static void setup_swap_info(struct swap_info_struct *si, int prio,
2648 unsigned char *swap_map,
2649 struct swap_cluster_info *cluster_info,
2650 unsigned long *zeromap)
2651 {
2652 si->prio = prio;
2653 /*
2654 * the plist prio is negated because plist ordering is
2655 * low-to-high, while swap ordering is high-to-low
2656 */
2657 si->list.prio = -si->prio;
2658 si->avail_list.prio = -si->prio;
2659 si->swap_map = swap_map;
2660 si->cluster_info = cluster_info;
2661 si->zeromap = zeromap;
2662 }
2663
2664 static void _enable_swap_info(struct swap_info_struct *si)
2665 {
2666 atomic_long_add(si->pages, &nr_swap_pages);
2667 total_swap_pages += si->pages;
2668
2669 assert_spin_locked(&swap_lock);
2670
2671 plist_add(&si->list, &swap_active_head);
2672
2673 /* Add back to available list */
2674 add_to_avail_list(si, true);
2675 }
2676
2677 static void enable_swap_info(struct swap_info_struct *si, int prio,
2678 unsigned char *swap_map,
2679 struct swap_cluster_info *cluster_info,
2680 unsigned long *zeromap)
2681 {
2682 spin_lock(&swap_lock);
2683 spin_lock(&si->lock);
2684 setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
2685 spin_unlock(&si->lock);
2686 spin_unlock(&swap_lock);
2687 /*
2688 * Finished initializing swap device, now it's safe to reference it.
2689 */
2690 percpu_ref_resurrect(&si->users);
2691 spin_lock(&swap_lock);
2692 spin_lock(&si->lock);
2693 _enable_swap_info(si);
2694 spin_unlock(&si->lock);
2695 spin_unlock(&swap_lock);
2696 }
2697
2698 static void reinsert_swap_info(struct swap_info_struct *si)
2699 {
2700 spin_lock(&swap_lock);
2701 spin_lock(&si->lock);
2702 setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
2703 _enable_swap_info(si);
2704 spin_unlock(&si->lock);
2705 spin_unlock(&swap_lock);
2706 }
2707
2708 /*
2709 * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
2710 * sees the updated flags, so there will be no more allocations.
2711 */
2712 static void wait_for_allocation(struct swap_info_struct *si)
2713 {
2714 unsigned long offset;
2715 unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
2716 struct swap_cluster_info *ci;
2717
2718 BUG_ON(si->flags & SWP_WRITEOK);
2719
2720 for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
2721 ci = swap_cluster_lock(si, offset);
2722 swap_cluster_unlock(ci);
2723 }
2724 }
2725
2726 static void free_cluster_info(struct swap_cluster_info *cluster_info,
2727 unsigned long maxpages)
2728 {
2729 struct swap_cluster_info *ci;
2730 int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2731
2732 if (!cluster_info)
2733 return;
2734 for (i = 0; i < nr_clusters; i++) {
2735 ci = cluster_info + i;
2736 /* A cluster whose count includes bad-slot marks will still have a table remaining */
2737 spin_lock(&ci->lock);
2738 if (rcu_dereference_protected(ci->table, true)) {
2739 ci->count = 0;
2740 swap_cluster_free_table(ci);
2741 }
2742 spin_unlock(&ci->lock);
2743 }
2744 kvfree(cluster_info);
2745 }
2746
2747 /*
2748 * Called after swap device's reference count is dead, so
2749 * neither scan nor allocation will use it.
2750 */
2751 static void flush_percpu_swap_cluster(struct swap_info_struct *si)
2752 {
2753 int cpu, i;
2754 struct swap_info_struct **pcp_si;
2755
2756 for_each_possible_cpu(cpu) {
2757 pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
2758 /*
2759 * Invalidate the percpu swap cluster cache: si->users
2760 * is dead, so no new user will point to it; just flush
2761 * any existing users.
2762 */
2763 for (i = 0; i < SWAP_NR_ORDERS; i++)
2764 cmpxchg(&pcp_si[i], si, NULL);
2765 }
2766 }
2767
2768
2769 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2770 {
2771 struct swap_info_struct *p = NULL;
2772 unsigned char *swap_map;
2773 unsigned long *zeromap;
2774 struct swap_cluster_info *cluster_info;
2775 struct file *swap_file, *victim;
2776 struct address_space *mapping;
2777 struct inode *inode;
2778 unsigned int maxpages;
2779 int err, found = 0;
2780
2781 if (!capable(CAP_SYS_ADMIN))
2782 return -EPERM;
2783
2784 BUG_ON(!current->mm);
2785
2786 CLASS(filename, pathname)(specialfile);
2787 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
2788 if (IS_ERR(victim))
2789 return PTR_ERR(victim);
2790
2791 mapping = victim->f_mapping;
2792 spin_lock(&swap_lock);
2793 plist_for_each_entry(p, &swap_active_head, list) {
2794 if (p->flags & SWP_WRITEOK) {
2795 if (p->swap_file->f_mapping == mapping) {
2796 found = 1;
2797 break;
2798 }
2799 }
2800 }
2801 if (!found) {
2802 err = -EINVAL;
2803 spin_unlock(&swap_lock);
2804 goto out_dput;
2805 }
2806 if (!security_vm_enough_memory_mm(current->mm, p->pages))
2807 vm_unacct_memory(p->pages);
2808 else {
2809 err = -ENOMEM;
2810 spin_unlock(&swap_lock);
2811 goto out_dput;
2812 }
2813 spin_lock(&p->lock);
2814 del_from_avail_list(p, true);
2815 plist_del(&p->list, &swap_active_head);
2816 atomic_long_sub(p->pages, &nr_swap_pages);
2817 total_swap_pages -= p->pages;
2818 spin_unlock(&p->lock);
2819 spin_unlock(&swap_lock);
2820
2821 wait_for_allocation(p);
2822
2823 set_current_oom_origin();
2824 err = try_to_unuse(p->type);
2825 clear_current_oom_origin();
2826
2827 if (err) {
2828 /* re-insert swap space back into swap_list */
2829 reinsert_swap_info(p);
2830 goto out_dput;
2831 }
2832
2833 /*
2834 * Wait for swap operations protected by get/put_swap_device()
2835 * to complete. Because of synchronize_rcu() here, all swap
2836 * operations protected by RCU reader side lock (including any
2837 * spinlock) will be waited too. This makes it easy to
2838 * prevent folio_test_swapcache() and the following swap cache
2839 * operations from racing with swapoff.
2840 */
2841 percpu_ref_kill(&p->users);
2842 synchronize_rcu();
2843 wait_for_completion(&p->comp);
2844
2845 flush_work(&p->discard_work);
2846 flush_work(&p->reclaim_work);
2847 flush_percpu_swap_cluster(p);
2848
2849 destroy_swap_extents(p);
2850 if (p->flags & SWP_CONTINUED)
2851 free_swap_count_continuations(p);
2852
2853 if (!(p->flags & SWP_SOLIDSTATE))
2854 atomic_dec(&nr_rotate_swap);
2855
2856 mutex_lock(&swapon_mutex);
2857 spin_lock(&swap_lock);
2858 spin_lock(&p->lock);
2859 drain_mmlist();
2860
2861 swap_file = p->swap_file;
2862 p->swap_file = NULL;
2863 swap_map = p->swap_map;
2864 p->swap_map = NULL;
2865 zeromap = p->zeromap;
2866 p->zeromap = NULL;
2867 maxpages = p->max;
2868 cluster_info = p->cluster_info;
2869 p->max = 0;
2870 p->cluster_info = NULL;
2871 spin_unlock(&p->lock);
2872 spin_unlock(&swap_lock);
2873 arch_swap_invalidate_area(p->type);
2874 zswap_swapoff(p->type);
2875 mutex_unlock(&swapon_mutex);
2876 kfree(p->global_cluster);
2877 p->global_cluster = NULL;
2878 vfree(swap_map);
2879 kvfree(zeromap);
2880 free_cluster_info(cluster_info, maxpages);
2881 /* Destroy swap account information */
2882 swap_cgroup_swapoff(p->type);
2883
2884 inode = mapping->host;
2885
2886 inode_lock(inode);
2887 inode->i_flags &= ~S_SWAPFILE;
2888 inode_unlock(inode);
2889 filp_close(swap_file, NULL);
2890
2891 /*
2892 * Clear the SWP_USED flag after all resources are freed so that swapon
2893 * can reuse this swap_info in alloc_swap_info() safely. It is ok to
2894 * not hold p->lock after we cleared its SWP_WRITEOK.
2895 */
2896 spin_lock(&swap_lock);
2897 p->flags = 0;
2898 spin_unlock(&swap_lock);
2899
2900 err = 0;
2901 atomic_inc(&proc_poll_event);
2902 wake_up_interruptible(&proc_poll_wait);
2903
2904 out_dput:
2905 filp_close(victim, NULL);
2906 return err;
2907 }
2908
2909 #ifdef CONFIG_PROC_FS
2910 static __poll_t swaps_poll(struct file *file, poll_table *wait)
2911 {
2912 struct seq_file *seq = file->private_data;
2913
2914 poll_wait(file, &proc_poll_wait, wait);
2915
2916 if (seq->poll_event != atomic_read(&proc_poll_event)) {
2917 seq->poll_event = atomic_read(&proc_poll_event);
2918 return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
2919 }
2920
2921 return EPOLLIN | EPOLLRDNORM;
2922 }
2923
2924 /* iterator */
2925 static void *swap_start(struct seq_file *swap, loff_t *pos)
2926 {
2927 struct swap_info_struct *si;
2928 int type;
2929 loff_t l = *pos;
2930
2931 mutex_lock(&swapon_mutex);
2932
2933 if (!l)
2934 return SEQ_START_TOKEN;
2935
2936 for (type = 0; (si = swap_type_to_info(type)); type++) {
2937 if (!(si->flags & SWP_USED) || !si->swap_map)
2938 continue;
2939 if (!--l)
2940 return si;
2941 }
2942
2943 return NULL;
2944 }
2945
2946 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
2947 {
2948 struct swap_info_struct *si = v;
2949 int type;
2950
2951 if (v == SEQ_START_TOKEN)
2952 type = 0;
2953 else
2954 type = si->type + 1;
2955
2956 ++(*pos);
2957 for (; (si = swap_type_to_info(type)); type++) {
2958 if (!(si->flags & SWP_USED) || !si->swap_map)
2959 continue;
2960 return si;
2961 }
2962
2963 return NULL;
2964 }
2965
2966 static void swap_stop(struct seq_file *swap, void *v)
2967 {
2968 mutex_unlock(&swapon_mutex);
2969 }
2970
2971 static int swap_show(struct seq_file *swap, void *v)
2972 {
2973 struct swap_info_struct *si = v;
2974 struct file *file;
2975 int len;
2976 unsigned long bytes, inuse;
2977
2978 if (si == SEQ_START_TOKEN) {
2979 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
2980 return 0;
2981 }
2982
2983 bytes = K(si->pages);
2984 inuse = K(swap_usage_in_pages(si));
2985
2986 file = si->swap_file;
2987 len = seq_file_path(swap, file, " \t\n\\");
2988 seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
2989 len < 40 ? 40 - len : 1, " ",
2990 S_ISBLK(file_inode(file)->i_mode) ?
2991 "partition" : "file\t",
2992 bytes, bytes < 10000000 ? "\t" : "",
2993 inuse, inuse < 10000000 ? "\t" : "",
2994 si->prio);
2995 return 0;
2996 }
2997
2998 static const struct seq_operations swaps_op = {
2999 .start = swap_start,
3000 .next = swap_next,
3001 .stop = swap_stop,
3002 .show = swap_show
3003 };
3004
3005 static int swaps_open(struct inode *inode, struct file *file)
3006 {
3007 struct seq_file *seq;
3008 int ret;
3009
3010 ret = seq_open(file, &swaps_op);
3011 if (ret)
3012 return ret;
3013
3014 seq = file->private_data;
3015 seq->poll_event = atomic_read(&proc_poll_event);
3016 return 0;
3017 }
3018
3019 static const struct proc_ops swaps_proc_ops = {
3020 .proc_flags = PROC_ENTRY_PERMANENT,
3021 .proc_open = swaps_open,
3022 .proc_read = seq_read,
3023 .proc_lseek = seq_lseek,
3024 .proc_release = seq_release,
3025 .proc_poll = swaps_poll,
3026 };
3027
3028 static int __init procswaps_init(void)
3029 {
3030 proc_create("swaps", 0, NULL, &swaps_proc_ops);
3031 return 0;
3032 }
3033 __initcall(procswaps_init);
3034 #endif /* CONFIG_PROC_FS */
3035
3036 #ifdef MAX_SWAPFILES_CHECK
3037 static int __init max_swapfiles_check(void)
3038 {
3039 MAX_SWAPFILES_CHECK();
3040 return 0;
3041 }
3042 late_initcall(max_swapfiles_check);
3043 #endif
3044
3045 static struct swap_info_struct *alloc_swap_info(void)
3046 {
3047 struct swap_info_struct *p;
3048 struct swap_info_struct *defer = NULL;
3049 unsigned int type;
3050
3051 p = kvzalloc_obj(struct swap_info_struct);
3052 if (!p)
3053 return ERR_PTR(-ENOMEM);
3054
3055 if (percpu_ref_init(&p->users, swap_users_ref_free,
3056 PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
3057 kvfree(p);
3058 return ERR_PTR(-ENOMEM);
3059 }
3060
3061 spin_lock(&swap_lock);
3062 for (type = 0; type < nr_swapfiles; type++) {
3063 if (!(swap_info[type]->flags & SWP_USED))
3064 break;
3065 }
3066 if (type >= MAX_SWAPFILES) {
3067 spin_unlock(&swap_lock);
3068 percpu_ref_exit(&p->users);
3069 kvfree(p);
3070 return ERR_PTR(-EPERM);
3071 }
3072 if (type >= nr_swapfiles) {
3073 p->type = type;
3074 /*
3075 * Publish the swap_info_struct after initializing it.
3076 * Note that kvzalloc() above zeroes all its fields.
3077 */
3078 smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
3079 nr_swapfiles++;
3080 } else {
3081 defer = p;
3082 p = swap_info[type];
3083 /*
3084 * Do not memset this entry: a racing procfs swap_next()
3085 * would be relying on p->type to remain valid.
3086 */
3087 }
3088 p->swap_extent_root = RB_ROOT;
3089 plist_node_init(&p->list, 0);
3090 plist_node_init(&p->avail_list, 0);
3091 p->flags = SWP_USED;
3092 spin_unlock(&swap_lock);
3093 if (defer) {
3094 percpu_ref_exit(&defer->users);
3095 kvfree(defer);
3096 }
3097 spin_lock_init(&p->lock);
3098 spin_lock_init(&p->cont_lock);
3099 atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
3100 init_completion(&p->comp);
3101
3102 return p;
3103 }
3104
3105 static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
3106 {
3107 if (S_ISBLK(inode->i_mode)) {
3108 si->bdev = I_BDEV(inode);
3109 /*
3110 * Zoned block devices contain zones that have a sequential
3111 * write only restriction. Hence zoned block devices are not
3112 * suitable for swapping. Disallow them here.
3113 */
3114 if (bdev_is_zoned(si->bdev))
3115 return -EINVAL;
3116 si->flags |= SWP_BLKDEV;
3117 } else if (S_ISREG(inode->i_mode)) {
3118 si->bdev = inode->i_sb->s_bdev;
3119 }
3120
3121 return 0;
3122 }
3123
3124
3125 /*
3126 * Find out how many pages are allowed for a single swap device. There
3127 * are two limiting factors:
3128 * 1) the number of bits for the swap offset in the swp_entry_t type, and
3129 * 2) the number of bits in the swap pte, as defined by the different
3130 * architectures.
3131 *
3132 * In order to find the largest possible bit mask, a swap entry with
3133 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
3134 * decoded to a swp_entry_t again, and finally the swap offset is
3135 * extracted.
3136 *
3137 * This will mask all the bits from the initial ~0UL mask that can't
3138 * be encoded in either the swp_entry_t or the architecture definition
3139 * of a swap pte.
3140 */
3141 unsigned long generic_max_swapfile_size(void)
3142 {
3143 swp_entry_t entry = swp_entry(0, ~0UL);
3144 const pte_t pte = softleaf_to_pte(entry);
3145
3146 /*
3147 * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
3148 * we need to do this manually.
3149 */
3150 entry = __pte_to_swp_entry(pte);
3151 entry = swp_entry(__swp_type(entry), __swp_offset(entry));
3152
3153 return swp_offset(entry) + 1;
3154 }
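/*
 * Illustrative example (added for clarity): on an architecture whose swap
 * PTE format preserves, say, 50 offset bits, the round trip above truncates
 * the ~0UL offset to a 50-bit mask and the function returns 2^50, i.e. the
 * number of PAGE_SIZE slots a single swap device can address on that
 * architecture.
 */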
3155
3156 /* Can be overridden by an architecture for additional checks. */
3157 __weak unsigned long arch_max_swapfile_size(void)
3158 {
3159 return generic_max_swapfile_size();
3160 }
3161
3162 static unsigned long read_swap_header(struct swap_info_struct *si,
3163 union swap_header *swap_header,
3164 struct inode *inode)
3165 {
3166 int i;
3167 unsigned long maxpages;
3168 unsigned long swapfilepages;
3169 unsigned long last_page;
3170
3171 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
3172 pr_err("Unable to find swap-space signature\n");
3173 return 0;
3174 }
3175
3176 /* swap partition endianness hack... */
3177 if (swab32(swap_header->info.version) == 1) {
3178 swab32s(&swap_header->info.version);
3179 swab32s(&swap_header->info.last_page);
3180 swab32s(&swap_header->info.nr_badpages);
3181 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3182 return 0;
3183 for (i = 0; i < swap_header->info.nr_badpages; i++)
3184 swab32s(&swap_header->info.badpages[i]);
3185 }
3186 /* Check the swap header's sub-version */
3187 if (swap_header->info.version != 1) {
3188 pr_warn("Unable to handle swap header version %d\n",
3189 swap_header->info.version);
3190 return 0;
3191 }
3192
3193 maxpages = swapfile_maximum_size;
3194 last_page = swap_header->info.last_page;
3195 if (!last_page) {
3196 pr_warn("Empty swap-file\n");
3197 return 0;
3198 }
3199 if (last_page > maxpages) {
3200 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
3201 K(maxpages), K(last_page));
3202 }
3203 if (maxpages > last_page) {
3204 maxpages = last_page + 1;
3205 /* p->max is an unsigned int: don't overflow it */
3206 if ((unsigned int)maxpages == 0)
3207 maxpages = UINT_MAX;
3208 }
3209
3210 if (!maxpages)
3211 return 0;
3212 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
3213 if (swapfilepages && maxpages > swapfilepages) {
3214 pr_warn("Swap area shorter than signature indicates\n");
3215 return 0;
3216 }
3217 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
3218 return 0;
3219 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
3220 return 0;
3221
3222 return maxpages;
3223 }
3224
3225 static int setup_swap_map(struct swap_info_struct *si,
3226 union swap_header *swap_header,
3227 unsigned char *swap_map,
3228 unsigned long maxpages)
3229 {
3230 unsigned long i;
3231
3232 swap_map[0] = SWAP_MAP_BAD; /* omit header page */
3233 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3234 unsigned int page_nr = swap_header->info.badpages[i];
3235 if (page_nr == 0 || page_nr > swap_header->info.last_page)
3236 return -EINVAL;
3237 if (page_nr < maxpages) {
3238 swap_map[page_nr] = SWAP_MAP_BAD;
3239 si->pages--;
3240 }
3241 }
3242
3243 if (!si->pages) {
3244 pr_warn("Empty swap-file\n");
3245 return -EINVAL;
3246 }
3247
3248 return 0;
3249 }
3250
3251 static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
3252 union swap_header *swap_header,
3253 unsigned long maxpages)
3254 {
3255 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3256 struct swap_cluster_info *cluster_info;
3257 int err = -ENOMEM;
3258 unsigned long i;
3259
3260 cluster_info = kvzalloc_objs(*cluster_info, nr_clusters);
3261 if (!cluster_info)
3262 goto err;
3263
3264 for (i = 0; i < nr_clusters; i++)
3265 spin_lock_init(&cluster_info[i].lock);
3266
3267 if (!(si->flags & SWP_SOLIDSTATE)) {
3268 si->global_cluster = kmalloc_obj(*si->global_cluster);
3269 if (!si->global_cluster)
3270 goto err;
3271 for (i = 0; i < SWAP_NR_ORDERS; i++)
3272 si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
3273 spin_lock_init(&si->global_cluster_lock);
3274 }
3275
3276 /*
3277 * Mark unusable pages as unavailable. The clusters aren't
3278 * marked free yet, so no list operations are involved yet.
3279 *
3280 * See setup_swap_map(): header page, bad pages,
3281 * and the EOF part of the last cluster.
3282 */
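/*
 * For example (illustrative, assuming SWAPFILE_CLUSTER is 512): with
 * maxpages == 1000, slot 0 (the header page), any bad pages below 1000,
 * and the trailing slots 1000..1023 of the last cluster are all marked
 * bad below, so neither of the two clusters starts out free.
 */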
3283 err = swap_cluster_setup_bad_slot(cluster_info, 0);
3284 if (err)
3285 goto err;
3286 for (i = 0; i < swap_header->info.nr_badpages; i++) {
3287 unsigned int page_nr = swap_header->info.badpages[i];
3288
3289 if (page_nr >= maxpages)
3290 continue;
3291 err = swap_cluster_setup_bad_slot(cluster_info, page_nr);
3292 if (err)
3293 goto err;
3294 }
3295 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
3296 err = swap_cluster_setup_bad_slot(cluster_info, i);
3297 if (err)
3298 goto err;
3299 }
3300
3301 INIT_LIST_HEAD(&si->free_clusters);
3302 INIT_LIST_HEAD(&si->full_clusters);
3303 INIT_LIST_HEAD(&si->discard_clusters);
3304
3305 for (i = 0; i < SWAP_NR_ORDERS; i++) {
3306 INIT_LIST_HEAD(&si->nonfull_clusters[i]);
3307 INIT_LIST_HEAD(&si->frag_clusters[i]);
3308 }
3309
3310 for (i = 0; i < nr_clusters; i++) {
3311 struct swap_cluster_info *ci = &cluster_info[i];
3312
3313 if (ci->count) {
3314 ci->flags = CLUSTER_FLAG_NONFULL;
3315 list_add_tail(&ci->list, &si->nonfull_clusters[0]);
3316 } else {
3317 ci->flags = CLUSTER_FLAG_FREE;
3318 list_add_tail(&ci->list, &si->free_clusters);
3319 }
3320 }
3321
3322 return cluster_info;
3323 err:
3324 free_cluster_info(cluster_info, maxpages);
3325 return ERR_PTR(err);
3326 }
3327
3328 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
3329 {
3330 struct swap_info_struct *si;
3331 struct file *swap_file = NULL;
3332 struct address_space *mapping;
3333 struct dentry *dentry;
3334 int prio;
3335 int error;
3336 union swap_header *swap_header;
3337 int nr_extents;
3338 sector_t span;
3339 unsigned long maxpages;
3340 unsigned char *swap_map = NULL;
3341 unsigned long *zeromap = NULL;
3342 struct swap_cluster_info *cluster_info = NULL;
3343 struct folio *folio = NULL;
3344 struct inode *inode = NULL;
3345 bool inced_nr_rotate_swap = false;
3346
3347 if (swap_flags & ~SWAP_FLAGS_VALID)
3348 return -EINVAL;
3349
3350 if (!capable(CAP_SYS_ADMIN))
3351 return -EPERM;
3352
3353 si = alloc_swap_info();
3354 if (IS_ERR(si))
3355 return PTR_ERR(si);
3356
3357 INIT_WORK(&si->discard_work, swap_discard_work);
3358 INIT_WORK(&si->reclaim_work, swap_reclaim_work);
3359
3360 CLASS(filename, name)(specialfile);
3361 swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
3362 if (IS_ERR(swap_file)) {
3363 error = PTR_ERR(swap_file);
3364 swap_file = NULL;
3365 goto bad_swap;
3366 }
3367
3368 si->swap_file = swap_file;
3369 mapping = swap_file->f_mapping;
3370 dentry = swap_file->f_path.dentry;
3371 inode = mapping->host;
3372
3373 error = claim_swapfile(si, inode);
3374 if (unlikely(error))
3375 goto bad_swap;
3376
3377 inode_lock(inode);
3378 if (d_unlinked(dentry) || cant_mount(dentry)) {
3379 error = -ENOENT;
3380 goto bad_swap_unlock_inode;
3381 }
3382 if (IS_SWAPFILE(inode)) {
3383 error = -EBUSY;
3384 goto bad_swap_unlock_inode;
3385 }
3386
3387 /*
3388 * The swap subsystem needs a major overhaul to support this.
3389 * It doesn't work yet so just disable it for now.
3390 */
3391 if (mapping_min_folio_order(mapping) > 0) {
3392 error = -EINVAL;
3393 goto bad_swap_unlock_inode;
3394 }
3395
3396 /*
3397 * Read the swap header.
3398 */
3399 if (!mapping->a_ops->read_folio) {
3400 error = -EINVAL;
3401 goto bad_swap_unlock_inode;
3402 }
3403 folio = read_mapping_folio(mapping, 0, swap_file);
3404 if (IS_ERR(folio)) {
3405 error = PTR_ERR(folio);
3406 goto bad_swap_unlock_inode;
3407 }
3408 swap_header = kmap_local_folio(folio, 0);
3409
3410 maxpages = read_swap_header(si, swap_header, inode);
3411 if (unlikely(!maxpages)) {
3412 error = -EINVAL;
3413 goto bad_swap_unlock_inode;
3414 }
3415
3416 si->max = maxpages;
3417 si->pages = maxpages - 1;
3418 nr_extents = setup_swap_extents(si, &span);
3419 if (nr_extents < 0) {
3420 error = nr_extents;
3421 goto bad_swap_unlock_inode;
3422 }
3423 if (si->pages != si->max - 1) {
3424 pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
3425 error = -EINVAL;
3426 goto bad_swap_unlock_inode;
3427 }
3428
3429 maxpages = si->max;
3430
3431 /* OK, set up the swap map and apply the bad block list */
3432 swap_map = vzalloc(maxpages);
3433 if (!swap_map) {
3434 error = -ENOMEM;
3435 goto bad_swap_unlock_inode;
3436 }
3437
3438 error = swap_cgroup_swapon(si->type, maxpages);
3439 if (error)
3440 goto bad_swap_unlock_inode;
3441
3442 error = setup_swap_map(si, swap_header, swap_map, maxpages);
3443 if (error)
3444 goto bad_swap_unlock_inode;
3445
3446 /*
3447 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
3448 * be above MAX_PAGE_ORDER in case of a large swap file.
3449 */
3450 zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
3451 GFP_KERNEL | __GFP_ZERO);
3452 if (!zeromap) {
3453 error = -ENOMEM;
3454 goto bad_swap_unlock_inode;
3455 }
3456
3457 if (si->bdev && bdev_stable_writes(si->bdev))
3458 si->flags |= SWP_STABLE_WRITES;
3459
3460 if (si->bdev && bdev_synchronous(si->bdev))
3461 si->flags |= SWP_SYNCHRONOUS_IO;
3462
3463 if (si->bdev && bdev_nonrot(si->bdev)) {
3464 si->flags |= SWP_SOLIDSTATE;
3465 } else {
3466 atomic_inc(&nr_rotate_swap);
3467 inced_nr_rotate_swap = true;
3468 }
3469
3470 cluster_info = setup_clusters(si, swap_header, maxpages);
3471 if (IS_ERR(cluster_info)) {
3472 error = PTR_ERR(cluster_info);
3473 cluster_info = NULL;
3474 goto bad_swap_unlock_inode;
3475 }
3476
3477 if ((swap_flags & SWAP_FLAG_DISCARD) &&
3478 si->bdev && bdev_max_discard_sectors(si->bdev)) {
3479 /*
3480 * When discard is enabled for swap with no particular
3481 * policy flagged, we set all swap discard flags here in
3482 * order to sustain backward compatibility with older
3483 * swapon(8) releases.
3484 */
3485 si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
3486 SWP_PAGE_DISCARD);
3487
3488 /*
3489 * By flagging sys_swapon, a sysadmin can tell us to
3490 * either do single-time area discards only, or to just
3491 * perform discards for released swap page-clusters.
3492 * Now it's time to adjust the p->flags accordingly.
3493 */
3494 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3495 si->flags &= ~SWP_PAGE_DISCARD;
3496 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3497 si->flags &= ~SWP_AREA_DISCARD;
3498
3499 /* issue a swapon-time discard if it's still required */
3500 if (si->flags & SWP_AREA_DISCARD) {
3501 int err = discard_swap(si);
3502 if (unlikely(err))
3503 pr_err("swapon: discard_swap(%p): %d\n",
3504 si, err);
3505 }
3506 }
3507
3508 error = zswap_swapon(si->type, maxpages);
3509 if (error)
3510 goto bad_swap_unlock_inode;
3511
3512 /*
3513 * Flush any pending IO and dirty mappings before we start using this
3514 * swap device.
3515 */
3516 inode->i_flags |= S_SWAPFILE;
3517 error = inode_drain_writes(inode);
3518 if (error) {
3519 inode->i_flags &= ~S_SWAPFILE;
3520 goto free_swap_zswap;
3521 }
3522
3523 mutex_lock(&swapon_mutex);
3524 prio = DEF_SWAP_PRIO;
3525 if (swap_flags & SWAP_FLAG_PREFER)
3526 prio = swap_flags & SWAP_FLAG_PRIO_MASK;
3527 enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
3528
3529 pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
3530 K(si->pages), name->name, si->prio, nr_extents,
3531 K((unsigned long long)span),
3532 (si->flags & SWP_SOLIDSTATE) ? "SS" : "",
3533 (si->flags & SWP_DISCARDABLE) ? "D" : "",
3534 (si->flags & SWP_AREA_DISCARD) ? "s" : "",
3535 (si->flags & SWP_PAGE_DISCARD) ? "c" : "");
3536
3537 mutex_unlock(&swapon_mutex);
3538 atomic_inc(&proc_poll_event);
3539 wake_up_interruptible(&proc_poll_wait);
3540
3541 error = 0;
3542 goto out;
3543 free_swap_zswap:
3544 zswap_swapoff(si->type);
3545 bad_swap_unlock_inode:
3546 inode_unlock(inode);
3547 bad_swap:
3548 kfree(si->global_cluster);
3549 si->global_cluster = NULL;
3550 inode = NULL;
3551 destroy_swap_extents(si);
3552 swap_cgroup_swapoff(si->type);
3553 spin_lock(&swap_lock);
3554 si->swap_file = NULL;
3555 si->flags = 0;
3556 spin_unlock(&swap_lock);
3557 vfree(swap_map);
3558 kvfree(zeromap);
3559 if (cluster_info)
3560 free_cluster_info(cluster_info, maxpages);
3561 if (inced_nr_rotate_swap)
3562 atomic_dec(&nr_rotate_swap);
3563 if (swap_file)
3564 filp_close(swap_file, NULL);
3565 out:
3566 if (!IS_ERR_OR_NULL(folio))
3567 folio_release_kmap(folio, swap_header);
3568 if (inode)
3569 inode_unlock(inode);
3570 return error;
3571 }
3572
3573 void si_swapinfo(struct sysinfo *val)
3574 {
3575 unsigned int type;
3576 unsigned long nr_to_be_unused = 0;
3577
3578 spin_lock(&swap_lock);
3579 for (type = 0; type < nr_swapfiles; type++) {
3580 struct swap_info_struct *si = swap_info[type];
3581
3582 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3583 nr_to_be_unused += swap_usage_in_pages(si);
3584 }
3585 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
3586 val->totalswap = total_swap_pages + nr_to_be_unused;
3587 spin_unlock(&swap_lock);
3588 }
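/*
 * Userspace-facing effect (an illustrative sketch, not part of this file):
 * the freeswap/totalswap values filled in above are reported by sysinfo(2),
 * scaled by mem_unit:
 *
 *	#include <stdio.h>
 *	#include <sys/sysinfo.h>
 *
 *	struct sysinfo si;
 *	if (sysinfo(&si) == 0)
 *		printf("swap free: %llu of %llu bytes\n",
 *		       (unsigned long long)si.freeswap * si.mem_unit,
 *		       (unsigned long long)si.totalswap * si.mem_unit);
 */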
3589
3590 /*
3591 * Verify that nr swap entries are valid and increment their swap map counts.
3592 *
3593  * Return values:
3594  * - success -> 0
3595  * - swp_entry is invalid -> -EINVAL
3596  * - swap-mapped reference requested but the entry is not in use -> -ENOENT
3597  * - swap-mapped reference requested but needs a continued swap count -> -ENOMEM
3598 */
3599 static int swap_dup_entries(struct swap_info_struct *si,
3600 struct swap_cluster_info *ci,
3601 unsigned long offset,
3602 unsigned char usage, int nr)
3603 {
3604 int i;
3605 unsigned char count;
3606
3607 for (i = 0; i < nr; i++) {
3608 count = si->swap_map[offset + i];
3609 /*
3610 		 * For swap out, the allocator never allocates bad slots. For
3611 		 * swap in, readahead is guarded by swap_entry_swapped.
3612 */
3613 if (WARN_ON(count == SWAP_MAP_BAD))
3614 return -ENOENT;
3615 /*
3616 		 * Swap count duplication must be guarded by either the swap cache folio
3617 		 * (folio_dup_swap) or an external lock on an existing entry (swap_dup_entry_direct).
3618 */
3619 if (WARN_ON(!count &&
3620 !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
3621 return -ENOENT;
3622 if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
3623 return -EINVAL;
3624 }
3625
3626 for (i = 0; i < nr; i++) {
3627 count = si->swap_map[offset + i];
3628 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3629 count += usage;
3630 else if (swap_count_continued(si, offset + i, count))
3631 count = COUNT_CONTINUED;
3632 else {
3633 /*
3634 			 * No need to roll back earlier changes: when
3635 			 * usage == 1, nr must also be 1.
3636 */
3637 return -ENOMEM;
3638 }
3639
3640 WRITE_ONCE(si->swap_map[offset + i], count);
3641 }
3642
3643 return 0;
3644 }
3645
3646 static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
3647 {
3648 int err;
3649 struct swap_info_struct *si;
3650 struct swap_cluster_info *ci;
3651 unsigned long offset = swp_offset(entry);
3652
3653 si = swap_entry_to_info(entry);
3654 if (WARN_ON_ONCE(!si)) {
3655 pr_err("%s%08lx\n", Bad_file, entry.val);
3656 return -EINVAL;
3657 }
3658
3659 VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
3660 ci = swap_cluster_lock(si, offset);
3661 err = swap_dup_entries(si, ci, offset, usage, nr);
3662 swap_cluster_unlock(ci);
3663 return err;
3664 }
3665
3666 /*
3667 * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
3668  * @entry: the swap entry whose reference count is to be increased.
3669 *
3670 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3671 * but could not be atomically allocated. Returns 0, just as if it succeeded,
3672 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3673 * might occur if a page table entry has got corrupted.
3674 *
3675 * Context: Caller must ensure there is no race condition on the reference
3676 * owner. e.g., locking the PTL of a PTE containing the entry being increased.
3677 */
3678 int swap_dup_entry_direct(swp_entry_t entry)
3679 {
3680 int err = 0;
3681 while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
3682 err = add_swap_count_continuation(entry, GFP_ATOMIC);
3683 return err;
3684 }
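/*
 * Typical caller pattern (a sketch under assumptions, not copied from a real
 * call site): with the page table lock held only a GFP_ATOMIC continuation
 * allocation is attempted, so on -ENOMEM the caller drops the lock and
 * preallocates with GFP_KERNEL via add_swap_count_continuation() below:
 *
 *	spin_lock(ptl);
 *	...
 *	ret = swap_dup_entry_direct(entry);
 *	spin_unlock(ptl);
 *	if (ret == -ENOMEM) {
 *		if (add_swap_count_continuation(entry, GFP_KERNEL))
 *			return -ENOMEM;
 *		goto again;	// retake the lock and retry the dup
 *	}
 */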
3685
3686 /*
3687 * add_swap_count_continuation - called when a swap count is duplicated
3688 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3689 * page of the original vmalloc'ed swap_map, to hold the continuation count
3690 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
3691 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3692 *
3693 * These continuation pages are seldom referenced: the common paths all work
3694 * on the original swap_map, only referring to a continuation page when the
3695 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3696 *
3697 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3698 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3699 * can be called after dropping locks.
3700 */
3701 int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3702 {
3703 struct swap_info_struct *si;
3704 struct swap_cluster_info *ci;
3705 struct page *head;
3706 struct page *page;
3707 struct page *list_page;
3708 pgoff_t offset;
3709 unsigned char count;
3710 int ret = 0;
3711
3712 /*
3713 * When debugging, it's easier to use __GFP_ZERO here; but it's better
3714 * for latency not to zero a page while GFP_ATOMIC and holding locks.
3715 */
3716 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
3717
3718 si = get_swap_device(entry);
3719 if (!si) {
3720 /*
3721 * An acceptable race has occurred since the failing
3722 		 * __swap_duplicate(): the swap device may have been swapped off.
3723 */
3724 goto outer;
3725 }
3726
3727 offset = swp_offset(entry);
3728
3729 ci = swap_cluster_lock(si, offset);
3730
3731 count = si->swap_map[offset];
3732
3733 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3734 /*
3735 * The higher the swap count, the more likely it is that tasks
3736 * will race to add swap count continuation: we need to avoid
3737 * over-provisioning.
3738 */
3739 goto out;
3740 }
3741
3742 if (!page) {
3743 ret = -ENOMEM;
3744 goto out;
3745 }
3746
3747 head = vmalloc_to_page(si->swap_map + offset);
3748 offset &= ~PAGE_MASK;
3749
3750 spin_lock(&si->cont_lock);
3751 /*
3752 * Page allocation does not initialize the page's lru field,
3753 * but it does always reset its private field.
3754 */
3755 if (!page_private(head)) {
3756 BUG_ON(count & COUNT_CONTINUED);
3757 INIT_LIST_HEAD(&head->lru);
3758 set_page_private(head, SWP_CONTINUED);
3759 si->flags |= SWP_CONTINUED;
3760 }
3761
3762 list_for_each_entry(list_page, &head->lru, lru) {
3763 unsigned char *map;
3764
3765 /*
3766 * If the previous map said no continuation, but we've found
3767 * a continuation page, free our allocation and use this one.
3768 */
3769 if (!(count & COUNT_CONTINUED))
3770 goto out_unlock_cont;
3771
3772 map = kmap_local_page(list_page) + offset;
3773 count = *map;
3774 kunmap_local(map);
3775
3776 /*
3777 * If this continuation count now has some space in it,
3778 * free our allocation and use this one.
3779 */
3780 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3781 goto out_unlock_cont;
3782 }
3783
3784 list_add_tail(&page->lru, &head->lru);
3785 page = NULL; /* now it's attached, don't free it */
3786 out_unlock_cont:
3787 spin_unlock(&si->cont_lock);
3788 out:
3789 swap_cluster_unlock(ci);
3790 put_swap_device(si);
3791 outer:
3792 if (page)
3793 __free_page(page);
3794 return ret;
3795 }
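/*
 * Layout sketch (illustrative): continuation pages hang off the ->lru list
 * of the vmalloc'ed swap_map page holding the original count byte:
 *
 *	swap_map page (head, page_private == SWP_CONTINUED)
 *	    head->lru --> cont page 1 --> cont page 2 --> ... --> head
 *
 * The byte at the same offset within each continuation page holds the next
 * higher "digit" of the count whose low digit lives in the head page, so the
 * first continuation digit of an entry is reached roughly as:
 *
 *	head = vmalloc_to_page(si->swap_map + offset);
 *	page = list_next_entry(head, lru);
 *	map  = kmap_local_page(page) + (offset & ~PAGE_MASK);
 */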
3796
3797 /*
3798 * swap_count_continued - when the original swap_map count is incremented
3799 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
3800 * into, carry if so, or else fail until a new continuation page is allocated;
3801 * when the original swap_map count is decremented from 0 with continuation,
3802 * borrow from the continuation and report whether it still holds more.
3803  * Called while __swap_duplicate() or a caller of swap_put_entry_locked()
3804  * holds the cluster lock.
3805 */
3806 static bool swap_count_continued(struct swap_info_struct *si,
3807 pgoff_t offset, unsigned char count)
3808 {
3809 struct page *head;
3810 struct page *page;
3811 unsigned char *map;
3812 bool ret;
3813
3814 head = vmalloc_to_page(si->swap_map + offset);
3815 if (page_private(head) != SWP_CONTINUED) {
3816 BUG_ON(count & COUNT_CONTINUED);
3817 return false; /* need to add count continuation */
3818 }
3819
3820 spin_lock(&si->cont_lock);
3821 offset &= ~PAGE_MASK;
3822 page = list_next_entry(head, lru);
3823 map = kmap_local_page(page) + offset;
3824
3825 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
3826 goto init_map; /* jump over SWAP_CONT_MAX checks */
3827
3828 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
3829 /*
3830 * Think of how you add 1 to 999
3831 */
3832 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
3833 kunmap_local(map);
3834 page = list_next_entry(page, lru);
3835 BUG_ON(page == head);
3836 map = kmap_local_page(page) + offset;
3837 }
3838 if (*map == SWAP_CONT_MAX) {
3839 kunmap_local(map);
3840 page = list_next_entry(page, lru);
3841 if (page == head) {
3842 ret = false; /* add count continuation */
3843 goto out;
3844 }
3845 map = kmap_local_page(page) + offset;
3846 init_map: *map = 0; /* we didn't zero the page */
3847 }
3848 *map += 1;
3849 kunmap_local(map);
3850 while ((page = list_prev_entry(page, lru)) != head) {
3851 map = kmap_local_page(page) + offset;
3852 *map = COUNT_CONTINUED;
3853 kunmap_local(map);
3854 }
3855 ret = true; /* incremented */
3856
3857 } else { /* decrementing */
3858 /*
3859 * Think of how you subtract 1 from 1000
3860 */
3861 BUG_ON(count != COUNT_CONTINUED);
3862 while (*map == COUNT_CONTINUED) {
3863 kunmap_local(map);
3864 page = list_next_entry(page, lru);
3865 BUG_ON(page == head);
3866 map = kmap_local_page(page) + offset;
3867 }
3868 BUG_ON(*map == 0);
3869 *map -= 1;
3870 if (*map == 0)
3871 count = 0;
3872 kunmap_local(map);
3873 while ((page = list_prev_entry(page, lru)) != head) {
3874 map = kmap_local_page(page) + offset;
3875 *map = SWAP_CONT_MAX | count;
3876 count = COUNT_CONTINUED;
3877 kunmap_local(map);
3878 }
3879 ret = count == COUNT_CONTINUED;
3880 }
3881 out:
3882 spin_unlock(&si->cont_lock);
3883 return ret;
3884 }
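/*
 * Worked example (illustrative; assumes the usual SWAP_MAP_MAX == 0x3e and
 * COUNT_CONTINUED == 0x80 definitions): duplicating an entry whose swap_map
 * byte already holds SWAP_MAP_MAX triggers the "add 1 to 999" carry above:
 *
 *	before:	swap_map[offset] == SWAP_MAP_MAX	(count 62)
 *	dup:	swap_count_continued(si, offset, SWAP_MAP_MAX)
 *		  first continuation byte goes 0 -> 1
 *	after:	swap_map[offset] == COUNT_CONTINUED	(low digit wraps to 0)
 *
 * The total count is now 63; further duplications bump the low digit again
 * until the next carry into the continuation byte.
 */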
3885
3886 /*
3887  * free_swap_count_continuations - called at swapoff to free all continuation
3888  * pages appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
3889 */
3890 static void free_swap_count_continuations(struct swap_info_struct *si)
3891 {
3892 pgoff_t offset;
3893
3894 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
3895 struct page *head;
3896 head = vmalloc_to_page(si->swap_map + offset);
3897 if (page_private(head)) {
3898 struct page *page, *next;
3899
3900 list_for_each_entry_safe(page, next, &head->lru, lru) {
3901 list_del(&page->lru);
3902 __free_page(page);
3903 }
3904 }
3905 }
3906 }
3907
3908 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
3909 static bool __has_usable_swap(void)
3910 {
3911 return !plist_head_empty(&swap_active_head);
3912 }
3913
3914 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
3915 {
3916 struct swap_info_struct *si;
3917
3918 if (!(gfp & __GFP_IO))
3919 return;
3920
3921 if (!__has_usable_swap())
3922 return;
3923
3924 if (!blk_cgroup_congested())
3925 return;
3926
3927 /*
3928 	 * We've already scheduled a throttle; avoid taking the global swap
3929 * lock.
3930 */
3931 if (current->throttle_disk)
3932 return;
3933
3934 spin_lock(&swap_avail_lock);
3935 plist_for_each_entry(si, &swap_avail_head, avail_list) {
3936 if (si->bdev) {
3937 blkcg_schedule_throttle(si->bdev->bd_disk, true);
3938 break;
3939 }
3940 }
3941 spin_unlock(&swap_avail_lock);
3942 }
3943 #endif
3944
3945 static int __init swapfile_init(void)
3946 {
3947 swapfile_maximum_size = arch_max_swapfile_size();
3948
3949 /*
3950 	 * Once a cluster is freed, its swap table content is read
3951 	 * only, and all swap cache readers (swap_cache_*) verify
3952 	 * the content before use. So it's safe to use an RCU slab here.
3953 */
3954 if (!SWP_TABLE_USE_PAGE)
3955 swap_table_cachep = kmem_cache_create("swap_table",
3956 sizeof(struct swap_table),
3957 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
3958
3959 #ifdef CONFIG_MIGRATION
3960 if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
3961 swap_migration_ad_supported = true;
3962 #endif /* CONFIG_MIGRATION */
3963
3964 return 0;
3965 }
3966 subsys_initcall(swapfile_init);
3967