xref: /linux/mm/page_io.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/mm/page_io.c
4  *
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  *
7  *  Swap reorganised 29.12.95,
8  *  Asynchronous swapping added 30.12.95. Stephen Tweedie
9  *  Removed race in async swapping. 14.4.1996. Bruno Haible
10  *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
11  *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
12  */
13 
14 #include <linux/mm.h>
15 #include <linux/kernel_stat.h>
16 #include <linux/gfp.h>
17 #include <linux/pagemap.h>
18 #include <linux/swap.h>
19 #include <linux/bio.h>
20 #include <linux/swapops.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/psi.h>
24 #include <linux/uio.h>
25 #include <linux/sched/task.h>
26 #include <linux/delayacct.h>
27 #include <linux/zswap.h>
28 #include "swap.h"
29 #include "swap_table.h"
30 
31 static void __end_swap_bio_write(struct bio *bio)
32 {
33 	struct folio *folio = bio_first_folio_all(bio);
34 
35 	if (bio->bi_status) {
36 		/*
37 		 * We failed to write the page out to swap-space.
38 		 * Re-dirty the page in order to avoid it being reclaimed.
39 		 * Also print a dire warning that things will go BAD (tm)
40 		 * very quickly.
41 		 *
42 		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
43 		 */
44 		folio_mark_dirty(folio);
45 		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
46 				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
47 				     (unsigned long long)bio->bi_iter.bi_sector);
48 		folio_clear_reclaim(folio);
49 	}
50 	folio_end_writeback(folio);
51 }
52 
/*
 * Completion callback installed as ->bi_end_io by the asynchronous swap
 * write path: run the common write completion, then drop the bio reference.
 */
static void end_swap_bio_write(struct bio *bio)
{
	__end_swap_bio_write(bio);
	bio_put(bio);
}
58 
59 static void __end_swap_bio_read(struct bio *bio)
60 {
61 	struct folio *folio = bio_first_folio_all(bio);
62 
63 	if (bio->bi_status) {
64 		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
65 				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
66 				     (unsigned long long)bio->bi_iter.bi_sector);
67 	} else {
68 		folio_mark_uptodate(folio);
69 	}
70 	folio_unlock(folio);
71 }
72 
/*
 * Completion callback installed as ->bi_end_io by the asynchronous swap
 * read path: run the common read completion, then drop the bio reference.
 */
static void end_swap_bio_read(struct bio *bio)
{
	__end_swap_bio_read(bio);
	bio_put(bio);
}
78 
79 int generic_swapfile_activate(struct swap_info_struct *sis,
80 				struct file *swap_file,
81 				sector_t *span)
82 {
83 	struct address_space *mapping = swap_file->f_mapping;
84 	struct inode *inode = mapping->host;
85 	unsigned blocks_per_page;
86 	unsigned long page_no;
87 	unsigned blkbits;
88 	sector_t probe_block;
89 	sector_t last_block;
90 	sector_t lowest_block = -1;
91 	sector_t highest_block = 0;
92 	int nr_extents = 0;
93 	int ret;
94 
95 	blkbits = inode->i_blkbits;
96 	blocks_per_page = PAGE_SIZE >> blkbits;
97 
98 	/*
99 	 * Map all the blocks into the extent tree.  This code doesn't try
100 	 * to be very smart.
101 	 */
102 	probe_block = 0;
103 	page_no = 0;
104 	last_block = i_size_read(inode) >> blkbits;
105 	while ((probe_block + blocks_per_page) <= last_block &&
106 			page_no < sis->max) {
107 		unsigned block_in_page;
108 		sector_t first_block;
109 
110 		cond_resched();
111 
112 		first_block = probe_block;
113 		ret = bmap(inode, &first_block);
114 		if (ret || !first_block)
115 			goto bad_bmap;
116 
117 		/*
118 		 * It must be PAGE_SIZE aligned on-disk
119 		 */
120 		if (first_block & (blocks_per_page - 1)) {
121 			probe_block++;
122 			goto reprobe;
123 		}
124 
125 		for (block_in_page = 1; block_in_page < blocks_per_page;
126 					block_in_page++) {
127 			sector_t block;
128 
129 			block = probe_block + block_in_page;
130 			ret = bmap(inode, &block);
131 			if (ret || !block)
132 				goto bad_bmap;
133 
134 			if (block != first_block + block_in_page) {
135 				/* Discontiguity */
136 				probe_block++;
137 				goto reprobe;
138 			}
139 		}
140 
141 		first_block >>= (PAGE_SHIFT - blkbits);
142 		if (page_no) {	/* exclude the header page */
143 			if (first_block < lowest_block)
144 				lowest_block = first_block;
145 			if (first_block > highest_block)
146 				highest_block = first_block;
147 		}
148 
149 		/*
150 		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
151 		 */
152 		ret = add_swap_extent(sis, page_no, 1, first_block);
153 		if (ret < 0)
154 			goto out;
155 		nr_extents += ret;
156 		page_no++;
157 		probe_block += blocks_per_page;
158 reprobe:
159 		continue;
160 	}
161 	ret = nr_extents;
162 	*span = 1 + highest_block - lowest_block;
163 	if (page_no == 0)
164 		page_no = 1;	/* force Empty message */
165 	sis->max = page_no;
166 	sis->pages = page_no - 1;
167 out:
168 	return ret;
169 bad_bmap:
170 	pr_err("swapon: swapfile has holes\n");
171 	ret = -EINVAL;
172 	goto out;
173 }
174 
175 static bool is_folio_zero_filled(struct folio *folio)
176 {
177 	unsigned int pos, last_pos;
178 	unsigned long *data;
179 	unsigned int i;
180 
181 	last_pos = PAGE_SIZE / sizeof(*data) - 1;
182 	for (i = 0; i < folio_nr_pages(folio); i++) {
183 		data = kmap_local_folio(folio, i * PAGE_SIZE);
184 		/*
185 		 * Check last word first, incase the page is zero-filled at
186 		 * the start and has non-zero data at the end, which is common
187 		 * in real-world workloads.
188 		 */
189 		if (data[last_pos]) {
190 			kunmap_local(data);
191 			return false;
192 		}
193 		for (pos = 0; pos < last_pos; pos++) {
194 			if (data[pos]) {
195 				kunmap_local(data);
196 				return false;
197 			}
198 		}
199 		kunmap_local(data);
200 	}
201 
202 	return true;
203 }
204 
205 static void swap_zeromap_folio_set(struct folio *folio)
206 {
207 	struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
208 	int nr_pages = folio_nr_pages(folio);
209 	struct swap_cluster_info *ci;
210 	swp_entry_t entry;
211 	unsigned int i;
212 
213 	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
214 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
215 
216 	ci = swap_cluster_get_and_lock(folio);
217 	for (i = 0; i < folio_nr_pages(folio); i++) {
218 		entry = page_swap_entry(folio_page(folio, i));
219 		__swap_table_set_zero(ci, swp_cluster_offset(entry));
220 	}
221 	swap_cluster_unlock(ci);
222 
223 	count_vm_events(SWPOUT_ZERO, nr_pages);
224 	if (objcg) {
225 		count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
226 		obj_cgroup_put(objcg);
227 	}
228 }
229 
230 static void swap_zeromap_folio_clear(struct folio *folio)
231 {
232 	struct swap_cluster_info *ci;
233 	swp_entry_t entry;
234 	unsigned int i;
235 
236 	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
237 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
238 
239 	ci = swap_cluster_get_and_lock(folio);
240 	for (i = 0; i < folio_nr_pages(folio); i++) {
241 		entry = page_swap_entry(folio_page(folio, i));
242 		__swap_table_clear_zero(ci, swp_cluster_offset(entry));
243 	}
244 	swap_cluster_unlock(ci);
245 }
246 
247 /*
248  * We may have stale swap cache pages in memory: notice
249  * them here and get rid of the unnecessary final write.
250  */
251 int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
252 {
253 	int ret = 0;
254 
255 	if (folio_free_swap(folio))
256 		goto out_unlock;
257 
258 	/*
259 	 * Arch code may have to preserve more data than just the page
260 	 * contents, e.g. memory tags.
261 	 */
262 	ret = arch_prepare_to_swap(folio);
263 	if (ret) {
264 		folio_mark_dirty(folio);
265 		goto out_unlock;
266 	}
267 
268 	/*
269 	 * Use the swap table zero mark to avoid doing IO for zero-filled
270 	 * pages. The zero mark is protected by the cluster lock, which is
271 	 * acquired internally by swap_zeromap_folio_set/clear.
272 	 */
273 	if (is_folio_zero_filled(folio)) {
274 		swap_zeromap_folio_set(folio);
275 		goto out_unlock;
276 	}
277 
278 	/*
279 	 * Clear bits this folio occupies in the zeromap to prevent zero data
280 	 * being read in from any previous zero writes that occupied the same
281 	 * swap entries.
282 	 */
283 	swap_zeromap_folio_clear(folio);
284 
285 	if (zswap_store(folio)) {
286 		count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
287 		goto out_unlock;
288 	}
289 
290 	rcu_read_lock();
291 	if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
292 		rcu_read_unlock();
293 		folio_mark_dirty(folio);
294 		return AOP_WRITEPAGE_ACTIVATE;
295 	}
296 	rcu_read_unlock();
297 
298 	__swap_writepage(folio, swap_plug);
299 	return 0;
300 out_unlock:
301 	folio_unlock(folio);
302 	return ret;
303 }
304 
305 static inline void count_swpout_vm_event(struct folio *folio)
306 {
307 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
308 	if (unlikely(folio_test_pmd_mappable(folio))) {
309 		count_memcg_folio_events(folio, THP_SWPOUT, 1);
310 		count_vm_event(THP_SWPOUT);
311 	}
312 #endif
313 	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
314 	count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
315 	count_vm_events(PSWPOUT, folio_nr_pages(folio));
316 }
317 
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * Associate @bio with the io-controller css that corresponds to the memcg
 * @folio is charged to.  Does nothing for a folio without a memcg charge.
 * The memcg lookup and the association run under rcu_read_lock().
 */
static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (folio_memcg_charged(folio)) {
		rcu_read_lock();
		memcg = folio_memcg(folio);
		bio_associate_blkg_from_css(bio,
				cgroup_e_css(memcg->css.cgroup,
					     &io_cgrp_subsys));
		rcu_read_unlock();
	}
}
#else
/* Without both MEMCG and BLK_CGROUP there is nothing to associate. */
#define bio_associate_blkg_from_page(bio, folio)		do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
336 
337 struct swap_iocb {
338 	struct kiocb		iocb;
339 	struct bio_vec		bvecs[SWAP_CLUSTER_MAX];
340 	int			nr_bvecs;
341 	int			len;
342 };
343 static mempool_t *sio_pool;
344 
345 int sio_pool_init(void)
346 {
347 	if (!sio_pool) {
348 		mempool_t *pool = mempool_create_kmalloc_pool(
349 			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
350 		if (cmpxchg(&sio_pool, NULL, pool))
351 			mempool_destroy(pool);
352 	}
353 	if (!sio_pool)
354 		return -ENOMEM;
355 	return 0;
356 }
357 
358 static void sio_write_complete(struct kiocb *iocb, long ret)
359 {
360 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
361 	struct page *page = sio->bvecs[0].bv_page;
362 	int p;
363 
364 	if (ret != sio->len) {
365 		/*
366 		 * In the case of swap-over-nfs, this can be a
367 		 * temporary failure if the system has limited
368 		 * memory for allocating transmit buffers.
369 		 * Mark the page dirty and avoid
370 		 * folio_rotate_reclaimable but rate-limit the
371 		 * messages.
372 		 */
373 		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
374 				   ret, swap_dev_pos(page_swap_entry(page)));
375 		for (p = 0; p < sio->nr_bvecs; p++) {
376 			page = sio->bvecs[p].bv_page;
377 			set_page_dirty(page);
378 			ClearPageReclaim(page);
379 		}
380 	}
381 
382 	for (p = 0; p < sio->nr_bvecs; p++)
383 		end_page_writeback(sio->bvecs[p].bv_page);
384 
385 	mempool_free(sio, sio_pool);
386 }
387 
388 static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
389 {
390 	struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
391 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
392 	struct file *swap_file = sis->swap_file;
393 	loff_t pos = swap_dev_pos(folio->swap);
394 
395 	count_swpout_vm_event(folio);
396 	folio_start_writeback(folio);
397 	folio_unlock(folio);
398 	if (sio) {
399 		if (sio->iocb.ki_filp != swap_file ||
400 		    sio->iocb.ki_pos + sio->len != pos) {
401 			swap_write_unplug(sio);
402 			sio = NULL;
403 		}
404 	}
405 	if (!sio) {
406 		sio = mempool_alloc(sio_pool, GFP_NOIO);
407 		init_sync_kiocb(&sio->iocb, swap_file);
408 		sio->iocb.ki_complete = sio_write_complete;
409 		sio->iocb.ki_pos = pos;
410 		sio->nr_bvecs = 0;
411 		sio->len = 0;
412 	}
413 	bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
414 	sio->len += folio_size(folio);
415 	sio->nr_bvecs += 1;
416 	if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !swap_plug) {
417 		swap_write_unplug(sio);
418 		sio = NULL;
419 	}
420 	if (swap_plug)
421 		*swap_plug = sio;
422 }
423 
424 static void swap_writepage_bdev_sync(struct folio *folio,
425 		struct swap_info_struct *sis)
426 {
427 	struct bio_vec bv;
428 	struct bio bio;
429 
430 	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
431 	bio.bi_iter.bi_sector = swap_folio_sector(folio);
432 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
433 
434 	bio_associate_blkg_from_page(&bio, folio);
435 	count_swpout_vm_event(folio);
436 
437 	folio_start_writeback(folio);
438 	folio_unlock(folio);
439 
440 	submit_bio_wait(&bio);
441 	__end_swap_bio_write(&bio);
442 }
443 
444 static void swap_writepage_bdev_async(struct folio *folio,
445 		struct swap_info_struct *sis)
446 {
447 	struct bio *bio;
448 
449 	bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
450 	bio->bi_iter.bi_sector = swap_folio_sector(folio);
451 	bio->bi_end_io = end_swap_bio_write;
452 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
453 
454 	bio_associate_blkg_from_page(bio, folio);
455 	count_swpout_vm_event(folio);
456 	folio_start_writeback(folio);
457 	folio_unlock(folio);
458 	submit_bio(bio);
459 }
460 
461 void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
462 {
463 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
464 
465 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
466 	/*
467 	 * ->flags can be updated non-atomically,
468 	 * but that will never affect SWP_FS_OPS, so the data_race
469 	 * is safe.
470 	 */
471 	if (data_race(sis->flags & SWP_FS_OPS))
472 		swap_writepage_fs(folio, swap_plug);
473 	/*
474 	 * ->flags can be updated non-atomically,
475 	 * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
476 	 * is safe.
477 	 */
478 	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
479 		swap_writepage_bdev_sync(folio, sis);
480 	else
481 		swap_writepage_bdev_async(folio, sis);
482 }
483 
484 void swap_write_unplug(struct swap_iocb *sio)
485 {
486 	struct iov_iter from;
487 	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
488 	int ret;
489 
490 	iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len);
491 	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
492 	if (ret != -EIOCBQUEUED)
493 		sio_write_complete(&sio->iocb, ret);
494 }
495 
496 static void sio_read_complete(struct kiocb *iocb, long ret)
497 {
498 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
499 	int p;
500 
501 	if (ret == sio->len) {
502 		for (p = 0; p < sio->nr_bvecs; p++) {
503 			struct folio *folio = bvec_folio(&sio->bvecs[p]);
504 
505 			count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
506 			count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
507 			folio_mark_uptodate(folio);
508 			folio_unlock(folio);
509 		}
510 		count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
511 	} else {
512 		for (p = 0; p < sio->nr_bvecs; p++) {
513 			struct folio *folio = bvec_folio(&sio->bvecs[p]);
514 
515 			folio_unlock(folio);
516 		}
517 		pr_alert_ratelimited("Read-error on swap-device\n");
518 	}
519 	mempool_free(sio, sio_pool);
520 }
521 
522 /*
523  * Return the count of contiguous swap entries that share the same
524  * zeromap status as the starting entry. If is_zerop is not NULL,
525  * it will return the zeromap status of the starting entry.
526  *
527  * Context: Caller must ensure the cluster containing the entries
528  * that will be checked won't be freed.
529  */
530 static int swap_zeromap_batch(swp_entry_t entry, int max_nr,
531 			      bool *is_zerop)
532 {
533 	int i;
534 	bool is_zero;
535 	unsigned int ci_start = swp_cluster_offset(entry);
536 	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
537 
538 	VM_WARN_ON_ONCE(ci_start + max_nr > SWAPFILE_CLUSTER);
539 
540 	rcu_read_lock();
541 	is_zero = __swap_table_test_zero(ci, ci_start);
542 	for (i = 1; i < max_nr; i++)
543 		if (is_zero != __swap_table_test_zero(ci, ci_start + i))
544 			break;
545 	rcu_read_unlock();
546 	if (is_zerop)
547 		*is_zerop = is_zero;
548 
549 	return i;
550 }
551 
552 static bool swap_read_folio_zeromap(struct folio *folio)
553 {
554 	int nr_pages = folio_nr_pages(folio);
555 	struct obj_cgroup *objcg;
556 	bool is_zeromap;
557 
558 	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
559 
560 	/*
561 	 * Swapping in a large folio that is partially in the zeromap is not
562 	 * currently handled. Return true without marking the folio uptodate so
563 	 * that an IO error is emitted (e.g. do_swap_page() will sigbus).
564 	 * Folio lock stabilizes the cluster and map, so the check is safe.
565 	 */
566 	if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages,
567 			 &is_zeromap) != nr_pages))
568 		return true;
569 
570 	if (!is_zeromap)
571 		return false;
572 
573 	objcg = get_obj_cgroup_from_folio(folio);
574 	count_vm_events(SWPIN_ZERO, nr_pages);
575 	if (objcg) {
576 		count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
577 		obj_cgroup_put(objcg);
578 	}
579 
580 	folio_zero_range(folio, 0, folio_size(folio));
581 	folio_mark_uptodate(folio);
582 	return true;
583 }
584 
585 static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
586 {
587 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
588 	struct swap_iocb *sio = NULL;
589 	loff_t pos = swap_dev_pos(folio->swap);
590 
591 	if (plug)
592 		sio = *plug;
593 	if (sio) {
594 		if (sio->iocb.ki_filp != sis->swap_file ||
595 		    sio->iocb.ki_pos + sio->len != pos) {
596 			swap_read_unplug(sio);
597 			sio = NULL;
598 		}
599 	}
600 	if (!sio) {
601 		sio = mempool_alloc(sio_pool, GFP_KERNEL);
602 		init_sync_kiocb(&sio->iocb, sis->swap_file);
603 		sio->iocb.ki_pos = pos;
604 		sio->iocb.ki_complete = sio_read_complete;
605 		sio->nr_bvecs = 0;
606 		sio->len = 0;
607 	}
608 	bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
609 	sio->len += folio_size(folio);
610 	sio->nr_bvecs += 1;
611 	if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs) || !plug) {
612 		swap_read_unplug(sio);
613 		sio = NULL;
614 	}
615 	if (plug)
616 		*plug = sio;
617 }
618 
619 static void swap_read_folio_bdev_sync(struct folio *folio,
620 		struct swap_info_struct *sis)
621 {
622 	struct bio_vec bv;
623 	struct bio bio;
624 
625 	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
626 	bio.bi_iter.bi_sector = swap_folio_sector(folio);
627 	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
628 	/*
629 	 * Keep this task valid during swap readpage because the oom killer may
630 	 * attempt to access it in the page fault retry time check.
631 	 */
632 	get_task_struct(current);
633 	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
634 	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
635 	count_vm_events(PSWPIN, folio_nr_pages(folio));
636 	submit_bio_wait(&bio);
637 	__end_swap_bio_read(&bio);
638 	put_task_struct(current);
639 }
640 
641 static void swap_read_folio_bdev_async(struct folio *folio,
642 		struct swap_info_struct *sis)
643 {
644 	struct bio *bio;
645 
646 	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
647 	bio->bi_iter.bi_sector = swap_folio_sector(folio);
648 	bio->bi_end_io = end_swap_bio_read;
649 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
650 	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
651 	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
652 	count_vm_events(PSWPIN, folio_nr_pages(folio));
653 	submit_bio(bio);
654 }
655 
656 void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
657 {
658 	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
659 	bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
660 	bool workingset = folio_test_workingset(folio);
661 	unsigned long pflags;
662 	bool in_thrashing;
663 
664 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
665 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
666 	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
667 
668 	/*
669 	 * Count submission time as memory stall and delay. When the device
670 	 * is congested, or the submitting cgroup IO-throttled, submission
671 	 * can be a significant part of overall IO time.
672 	 */
673 	if (workingset) {
674 		delayacct_thrashing_start(&in_thrashing);
675 		psi_memstall_enter(&pflags);
676 	}
677 	delayacct_swapin_start();
678 
679 	if (swap_read_folio_zeromap(folio)) {
680 		folio_unlock(folio);
681 		goto finish;
682 	}
683 
684 	if (zswap_load(folio) != -ENOENT)
685 		goto finish;
686 
687 	/* We have to read from slower devices. Increase zswap protection. */
688 	zswap_folio_swapin(folio);
689 
690 	if (data_race(sis->flags & SWP_FS_OPS)) {
691 		swap_read_folio_fs(folio, plug);
692 	} else if (synchronous) {
693 		swap_read_folio_bdev_sync(folio, sis);
694 	} else {
695 		swap_read_folio_bdev_async(folio, sis);
696 	}
697 
698 finish:
699 	if (workingset) {
700 		delayacct_thrashing_end(&in_thrashing);
701 		psi_memstall_leave(&pflags);
702 	}
703 	delayacct_swapin_end();
704 }
705 
706 void __swap_read_unplug(struct swap_iocb *sio)
707 {
708 	struct iov_iter from;
709 	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
710 	int ret;
711 
712 	iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len);
713 	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
714 	if (ret != -EIOCBQUEUED)
715 		sio_read_complete(&sio->iocb, ret);
716 }
717