xref: /linux/fs/xfs/xfs_aops.c (revision c4dde411bc366f568dbe33366253bbfea049e8ea)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * Copyright (c) 2016-2025 Christoph Hellwig.
5  * All Rights Reserved.
6  */
7 #include "xfs_platform.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_iomap.h"
16 #include "xfs_trace.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_util.h"
19 #include "xfs_reflink.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_icache.h"
23 #include "xfs_zone_alloc.h"
24 #include "xfs_rtgroup.h"
25 #include <linux/bio-integrity.h>
26 
27 struct xfs_writepage_ctx {
28 	struct iomap_writepage_ctx ctx;
29 	unsigned int		data_seq;
30 	unsigned int		cow_seq;
31 };
32 
33 static inline struct xfs_writepage_ctx *
34 XFS_WPC(struct iomap_writepage_ctx *ctx)
35 {
36 	return container_of(ctx, struct xfs_writepage_ctx, ctx);
37 }
38 
39 /*
40  * Fast and loose check if this write could update the on-disk inode size.
41  */
42 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
43 {
44 	return ioend->io_offset + ioend->io_size >
45 		XFS_I(ioend->io_inode)->i_disk_size;
46 }
47 
48 /*
49  * Update on-disk file size now that data has been written to disk.
50  */
51 int
52 xfs_setfilesize(
53 	struct xfs_inode	*ip,
54 	xfs_off_t		offset,
55 	size_t			size)
56 {
57 	struct xfs_mount	*mp = ip->i_mount;
58 	struct xfs_trans	*tp;
59 	xfs_fsize_t		isize;
60 	int			error;
61 
62 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
63 	if (error)
64 		return error;
65 
66 	xfs_ilock(ip, XFS_ILOCK_EXCL);
67 	isize = xfs_new_eof(ip, offset + size);
68 	if (!isize) {
69 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
70 		xfs_trans_cancel(tp);
71 		return 0;
72 	}
73 
74 	trace_xfs_setfilesize(ip, offset, size);
75 
76 	ip->i_disk_size = isize;
77 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
78 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
79 
80 	return xfs_trans_commit(tp);
81 }
82 
83 static void
84 xfs_ioend_put_open_zones(
85 	struct iomap_ioend	*ioend)
86 {
87 	struct iomap_ioend *tmp;
88 
89 	/*
90 	 * Put the open zone for all ioends merged into this one (if any).
91 	 */
92 	list_for_each_entry(tmp, &ioend->io_list, io_list)
93 		xfs_open_zone_put(tmp->io_private);
94 
95 	/*
96 	 * The main ioend might not have an open zone if the submission failed
97 	 * before xfs_zone_alloc_and_submit got called.
98 	 */
99 	if (ioend->io_private)
100 		xfs_open_zone_put(ioend->io_private);
101 }
102 
103 /*
104  * IO write completion.
105  */
106 STATIC void
107 xfs_end_ioend_write(
108 	struct iomap_ioend	*ioend)
109 {
110 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
111 	struct xfs_mount	*mp = ip->i_mount;
112 	bool			is_zoned = xfs_is_zoned_inode(ip);
113 	xfs_off_t		offset = ioend->io_offset;
114 	size_t			size = ioend->io_size;
115 	unsigned int		nofs_flag;
116 	int			error;
117 
118 	/*
119 	 * We can allocate memory here while doing writeback on behalf of
120 	 * memory reclaim.  To avoid memory allocation deadlocks set the
121 	 * task-wide nofs context for the following operations.
122 	 */
123 	nofs_flag = memalloc_nofs_save();
124 
125 	/*
126 	 * Just clean up the in-memory structures if the fs has been shut down.
127 	 */
128 	if (xfs_is_shutdown(mp)) {
129 		error = -EIO;
130 		goto done;
131 	}
132 
133 	/*
134 	 * Clean up all COW blocks and underlying data fork delalloc blocks on
135 	 * I/O error. The delalloc punch is required because this ioend was
136 	 * mapped to blocks in the COW fork and the associated pages are no
137 	 * longer dirty. If we don't remove delalloc blocks here, they become
138 	 * stale and can corrupt free space accounting on unmount.
139 	 */
140 	error = blk_status_to_errno(ioend->io_bio.bi_status);
141 	if (unlikely(error)) {
142 		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
143 			ASSERT(!is_zoned);
144 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
145 			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
146 					offset + size, NULL);
147 		}
148 		goto done;
149 	}
150 
151 	/*
152 	 * Success: commit the COW or unwritten blocks if needed.
153 	 */
154 	if (is_zoned)
155 		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
156 				ioend->io_private, NULLFSBLOCK);
157 	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
158 		error = xfs_reflink_end_cow(ip, offset, size);
159 	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
160 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
161 
162 	if (!error &&
163 	    !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
164 	    xfs_ioend_is_append(ioend))
165 		error = xfs_setfilesize(ip, offset, size);
166 done:
167 	if (is_zoned)
168 		xfs_ioend_put_open_zones(ioend);
169 	iomap_finish_ioends(ioend, error);
170 	memalloc_nofs_restore(nofs_flag);
171 }
172 
173 /*
174  * Finish all pending IO completions that require transactional modifications.
175  *
176  * We try to merge physical and logically contiguous ioends before completion to
177  * minimise the number of transactions we need to perform during IO completion.
178  * Both unwritten extent conversion and COW remapping need to iterate and modify
179  * one physical extent at a time, so we gain nothing by merging physically
180  * discontiguous extents here.
181  *
182  * The ioend chain length that we can be processing here is largely unbound in
183  * length and we may have to perform significant amounts of work on each ioend
184  * to complete it. Hence we have to be careful about holding the CPU for too
185  * long in this loop.
186  */
187 void
188 xfs_end_io(
189 	struct work_struct	*work)
190 {
191 	struct xfs_inode	*ip =
192 		container_of(work, struct xfs_inode, i_ioend_work);
193 	struct iomap_ioend	*ioend;
194 	struct list_head	tmp;
195 	unsigned long		flags;
196 
197 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
198 	list_replace_init(&ip->i_ioend_list, &tmp);
199 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
200 
201 	iomap_sort_ioends(&tmp);
202 	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
203 			io_list))) {
204 		list_del_init(&ioend->io_list);
205 		iomap_ioend_try_merge(ioend, &tmp);
206 		if (bio_op(&ioend->io_bio) == REQ_OP_READ)
207 			iomap_finish_ioends(ioend,
208 				blk_status_to_errno(ioend->io_bio.bi_status));
209 		else
210 			xfs_end_ioend_write(ioend);
211 		cond_resched();
212 	}
213 }
214 
215 void
216 xfs_end_bio(
217 	struct bio		*bio)
218 {
219 	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
220 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
221 	struct xfs_mount	*mp = ip->i_mount;
222 	unsigned long		flags;
223 
224 	/*
225 	 * For Appends record the actually written block number and set the
226 	 * boundary flag if needed.
227 	 */
228 	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
229 		ioend->io_sector = bio->bi_iter.bi_sector;
230 		xfs_mark_rtg_boundary(ioend);
231 	}
232 
233 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
234 	if (list_empty(&ip->i_ioend_list))
235 		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
236 					 &ip->i_ioend_work));
237 	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
238 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
239 }
240 
241 /*
242  * We cannot cancel the ioend directly on error.  We may have already set other
243  * pages under writeback and hence we have to run I/O completion to mark the
244  * error state of the pages under writeback appropriately.
245  *
246  * If the folio has delalloc blocks on it, the caller is asking us to punch them
247  * out. If we don't, we can leave a stale delalloc mapping covered by a clean
248  * page that needs to be dirtied again before the delalloc mapping can be
249  * converted. This stale delalloc mapping can trip up a later direct I/O read
250  * operation on the same region.
251  *
252  * We prevent this by truncating away the delalloc regions on the folio. Because
253  * they are delalloc, we can do this without needing a transaction. Indeed - if
254  * we get ENOSPC errors, we have to be able to do this truncation without a
255  * transaction as there is no space left for block reservation (typically why
256  * we see a ENOSPC in writeback).
257  */
258 static void
259 xfs_discard_folio(
260 	struct folio		*folio,
261 	loff_t			pos)
262 {
263 	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
264 	struct xfs_mount	*mp = ip->i_mount;
265 
266 	if (xfs_is_shutdown(mp))
267 		return;
268 
269 	xfs_alert_ratelimited(mp,
270 		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
271 			folio, ip->i_ino, pos);
272 
273 	/*
274 	 * The end of the punch range is always the offset of the first
275 	 * byte of the next folio. Hence the end offset is only dependent on the
276 	 * folio itself and not the start offset that is passed in.
277 	 */
278 	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
279 				folio_next_pos(folio), NULL);
280 }
281 
282 /*
283  * Fast revalidation of the cached writeback mapping. Return true if the current
284  * mapping is valid, false otherwise.
285  */
286 static bool
287 xfs_imap_valid(
288 	struct iomap_writepage_ctx	*wpc,
289 	struct xfs_inode		*ip,
290 	loff_t				offset)
291 {
292 	if (offset < wpc->iomap.offset ||
293 	    offset >= wpc->iomap.offset + wpc->iomap.length)
294 		return false;
295 	/*
296 	 * If this is a COW mapping, it is sufficient to check that the mapping
297 	 * covers the offset. Be careful to check this first because the caller
298 	 * can revalidate a COW mapping without updating the data seqno.
299 	 */
300 	if (wpc->iomap.flags & IOMAP_F_SHARED)
301 		return true;
302 
303 	/*
304 	 * This is not a COW mapping. Check the sequence number of the data fork
305 	 * because concurrent changes could have invalidated the extent. Check
306 	 * the COW fork because concurrent changes since the last time we
307 	 * checked (and found nothing at this offset) could have added
308 	 * overlapping blocks.
309 	 */
310 	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
311 		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
312 				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
313 		return false;
314 	}
315 	if (xfs_inode_has_cow_data(ip) &&
316 	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
317 		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
318 				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
319 		return false;
320 	}
321 	return true;
322 }
323 
324 static int
325 xfs_map_blocks(
326 	struct iomap_writepage_ctx *wpc,
327 	loff_t			offset,
328 	unsigned int		len)
329 {
330 	struct xfs_inode	*ip = XFS_I(wpc->inode);
331 	struct xfs_mount	*mp = ip->i_mount;
332 	ssize_t			count = i_blocksize(wpc->inode);
333 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
334 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
335 	xfs_fileoff_t		cow_fsb;
336 	int			whichfork;
337 	struct xfs_bmbt_irec	imap;
338 	struct xfs_iext_cursor	icur;
339 	int			retries = 0;
340 	int			error = 0;
341 	unsigned int		*seq;
342 
343 	if (xfs_is_shutdown(mp))
344 		return -EIO;
345 
346 	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
347 
348 	/*
349 	 * COW fork blocks can overlap data fork blocks even if the blocks
350 	 * aren't shared.  COW I/O always takes precedent, so we must always
351 	 * check for overlap on reflink inodes unless the mapping is already a
352 	 * COW one, or the COW fork hasn't changed from the last time we looked
353 	 * at it.
354 	 *
355 	 * It's safe to check the COW fork if_seq here without the ILOCK because
356 	 * we've indirectly protected against concurrent updates: writeback has
357 	 * the page locked, which prevents concurrent invalidations by reflink
358 	 * and directio and prevents concurrent buffered writes to the same
359 	 * page.  Changes to if_seq always happen under i_lock, which protects
360 	 * against concurrent updates and provides a memory barrier on the way
361 	 * out that ensures that we always see the current value.
362 	 */
363 	if (xfs_imap_valid(wpc, ip, offset))
364 		return 0;
365 
366 	/*
367 	 * If we don't have a valid map, now it's time to get a new one for this
368 	 * offset.  This will convert delayed allocations (including COW ones)
369 	 * into real extents.  If we return without a valid map, it means we
370 	 * landed in a hole and we skip the block.
371 	 */
372 retry:
373 	cow_fsb = NULLFILEOFF;
374 	whichfork = XFS_DATA_FORK;
375 	xfs_ilock(ip, XFS_ILOCK_SHARED);
376 	ASSERT(!xfs_need_iread_extents(&ip->i_df));
377 
378 	/*
379 	 * Check if this is offset is covered by a COW extents, and if yes use
380 	 * it directly instead of looking up anything in the data fork.
381 	 */
382 	if (xfs_inode_has_cow_data(ip) &&
383 	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
384 		cow_fsb = imap.br_startoff;
385 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
386 		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
387 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
388 
389 		whichfork = XFS_COW_FORK;
390 		goto allocate_blocks;
391 	}
392 
393 	/*
394 	 * No COW extent overlap. Revalidate now that we may have updated
395 	 * ->cow_seq. If the data mapping is still valid, we're done.
396 	 */
397 	if (xfs_imap_valid(wpc, ip, offset)) {
398 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
399 		return 0;
400 	}
401 
402 	/*
403 	 * If we don't have a valid map, now it's time to get a new one for this
404 	 * offset.  This will convert delayed allocations (including COW ones)
405 	 * into real extents.
406 	 */
407 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
408 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
409 	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
410 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
411 
412 	/* landed in a hole or beyond EOF? */
413 	if (imap.br_startoff > offset_fsb) {
414 		imap.br_blockcount = imap.br_startoff - offset_fsb;
415 		imap.br_startoff = offset_fsb;
416 		imap.br_startblock = HOLESTARTBLOCK;
417 		imap.br_state = XFS_EXT_NORM;
418 	}
419 
420 	/*
421 	 * Truncate to the next COW extent if there is one.  This is the only
422 	 * opportunity to do this because we can skip COW fork lookups for the
423 	 * subsequent blocks in the mapping; however, the requirement to treat
424 	 * the COW range separately remains.
425 	 */
426 	if (cow_fsb != NULLFILEOFF &&
427 	    cow_fsb < imap.br_startoff + imap.br_blockcount)
428 		imap.br_blockcount = cow_fsb - imap.br_startoff;
429 
430 	/* got a delalloc extent? */
431 	if (imap.br_startblock != HOLESTARTBLOCK &&
432 	    isnullstartblock(imap.br_startblock))
433 		goto allocate_blocks;
434 
435 	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
436 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
437 	return 0;
438 allocate_blocks:
439 	/*
440 	 * Convert a dellalloc extent to a real one. The current page is held
441 	 * locked so nothing could have removed the block backing offset_fsb,
442 	 * although it could have moved from the COW to the data fork by another
443 	 * thread.
444 	 */
445 	if (whichfork == XFS_COW_FORK)
446 		seq = &XFS_WPC(wpc)->cow_seq;
447 	else
448 		seq = &XFS_WPC(wpc)->data_seq;
449 
450 	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
451 				&wpc->iomap, seq);
452 	if (error) {
453 		/*
454 		 * If we failed to find the extent in the COW fork we might have
455 		 * raced with a COW to data fork conversion or truncate.
456 		 * Restart the lookup to catch the extent in the data fork for
457 		 * the former case, but prevent additional retries to avoid
458 		 * looping forever for the latter case.
459 		 */
460 		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
461 			goto retry;
462 		ASSERT(error != -EAGAIN);
463 		return error;
464 	}
465 
466 	/*
467 	 * Due to merging the return real extent might be larger than the
468 	 * original delalloc one.  Trim the return extent to the next COW
469 	 * boundary again to force a re-lookup.
470 	 */
471 	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
472 		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
473 
474 		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
475 			wpc->iomap.length = cow_offset - wpc->iomap.offset;
476 	}
477 
478 	ASSERT(wpc->iomap.offset <= offset);
479 	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
480 	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
481 	return 0;
482 }
483 
484 static ssize_t
485 xfs_writeback_range(
486 	struct iomap_writepage_ctx *wpc,
487 	struct folio		*folio,
488 	u64			offset,
489 	unsigned int		len,
490 	u64			end_pos)
491 {
492 	ssize_t			ret;
493 
494 	ret = xfs_map_blocks(wpc, offset, len);
495 	if (!ret)
496 		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
497 	if (ret < 0)
498 		xfs_discard_folio(folio, offset);
499 	return ret;
500 }
501 
502 static bool
503 xfs_ioend_needs_wq_completion(
504 	struct iomap_ioend	*ioend)
505 {
506 	/* Changing inode size requires a transaction. */
507 	if (xfs_ioend_is_append(ioend))
508 		return true;
509 
510 	/* Extent manipulation requires a transaction. */
511 	if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
512 		return true;
513 
514 	/* Page cache invalidation cannot be done in irq context. */
515 	if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
516 		return true;
517 
518 	return false;
519 }
520 
521 static int
522 xfs_writeback_submit(
523 	struct iomap_writepage_ctx	*wpc,
524 	int				error)
525 {
526 	struct iomap_ioend		*ioend = wpc->wb_ctx;
527 
528 	/*
529 	 * Convert CoW extents to regular.
530 	 *
531 	 * We can allocate memory here while doing writeback on behalf of memory
532 	 * reclaim.  To avoid memory allocation deadlocks, set the task-wide
533 	 * nofs context.
534 	 */
535 	if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
536 		unsigned int		nofs_flag;
537 
538 		nofs_flag = memalloc_nofs_save();
539 		error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
540 				ioend->io_offset, ioend->io_size);
541 		memalloc_nofs_restore(nofs_flag);
542 	}
543 
544 	/*
545 	 * Send ioends that might require a transaction to the completion wq.
546 	 */
547 	if (xfs_ioend_needs_wq_completion(ioend))
548 		ioend->io_bio.bi_end_io = xfs_end_bio;
549 
550 	return iomap_ioend_writeback_submit(wpc, error);
551 }
552 
553 static const struct iomap_writeback_ops xfs_writeback_ops = {
554 	.writeback_range	= xfs_writeback_range,
555 	.writeback_submit	= xfs_writeback_submit,
556 };
557 
558 struct xfs_zoned_writepage_ctx {
559 	struct iomap_writepage_ctx	ctx;
560 	struct xfs_open_zone		*open_zone;
561 };
562 
563 static inline struct xfs_zoned_writepage_ctx *
564 XFS_ZWPC(struct iomap_writepage_ctx *ctx)
565 {
566 	return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
567 }
568 
569 static int
570 xfs_zoned_map_blocks(
571 	struct iomap_writepage_ctx *wpc,
572 	loff_t			offset,
573 	unsigned int		len)
574 {
575 	struct xfs_inode	*ip = XFS_I(wpc->inode);
576 	struct xfs_mount	*mp = ip->i_mount;
577 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
578 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
579 	xfs_filblks_t		count_fsb;
580 	struct xfs_bmbt_irec	imap, del;
581 	struct xfs_iext_cursor	icur;
582 
583 	if (xfs_is_shutdown(mp))
584 		return -EIO;
585 
586 	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
587 
588 	/*
589 	 * All dirty data must be covered by delalloc extents.  But truncate can
590 	 * remove delalloc extents underneath us or reduce their size.
591 	 * Returning a hole tells iomap to not write back any data from this
592 	 * range, which is the right thing to do in that case.
593 	 *
594 	 * Otherwise just tell iomap to treat ranges previously covered by a
595 	 * delalloc extent as mapped.  The actual block allocation will be done
596 	 * just before submitting the bio.
597 	 *
598 	 * This implies we never map outside folios that are locked or marked
599 	 * as under writeback, and thus there is no need check the fork sequence
600 	 * count here.
601 	 */
602 	xfs_ilock(ip, XFS_ILOCK_EXCL);
603 	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
604 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
605 	if (imap.br_startoff > offset_fsb) {
606 		imap.br_blockcount = imap.br_startoff - offset_fsb;
607 		imap.br_startoff = offset_fsb;
608 		imap.br_startblock = HOLESTARTBLOCK;
609 		imap.br_state = XFS_EXT_NORM;
610 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
611 		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
612 		return 0;
613 	}
614 	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
615 	count_fsb = end_fsb - offset_fsb;
616 
617 	del = imap;
618 	xfs_trim_extent(&del, offset_fsb, count_fsb);
619 	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
620 			XFS_BMAPI_REMAP);
621 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
622 
623 	wpc->iomap.type = IOMAP_MAPPED;
624 	wpc->iomap.flags = IOMAP_F_DIRTY;
625 	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
626 	wpc->iomap.offset = offset;
627 	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
628 	wpc->iomap.flags = IOMAP_F_ANON_WRITE;
629 
630 	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
631 	return 0;
632 }
633 
634 static ssize_t
635 xfs_zoned_writeback_range(
636 	struct iomap_writepage_ctx *wpc,
637 	struct folio		*folio,
638 	u64			offset,
639 	unsigned int		len,
640 	u64			end_pos)
641 {
642 	ssize_t			ret;
643 
644 	ret = xfs_zoned_map_blocks(wpc, offset, len);
645 	if (!ret)
646 		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
647 	if (ret < 0)
648 		xfs_discard_folio(folio, offset);
649 	return ret;
650 }
651 
652 static int
653 xfs_zoned_writeback_submit(
654 	struct iomap_writepage_ctx	*wpc,
655 	int				error)
656 {
657 	struct iomap_ioend		*ioend = wpc->wb_ctx;
658 
659 	ioend->io_bio.bi_end_io = xfs_end_bio;
660 	if (error) {
661 		ioend->io_bio.bi_status = errno_to_blk_status(error);
662 		bio_endio(&ioend->io_bio);
663 		return error;
664 	}
665 	if (wpc->iomap.flags & IOMAP_F_INTEGRITY)
666 		fs_bio_integrity_generate(&ioend->io_bio);
667 	xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
668 	return 0;
669 }
670 
671 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
672 	.writeback_range	= xfs_zoned_writeback_range,
673 	.writeback_submit	= xfs_zoned_writeback_submit,
674 };
675 
676 STATIC int
677 xfs_vm_writepages(
678 	struct address_space	*mapping,
679 	struct writeback_control *wbc)
680 {
681 	struct xfs_inode	*ip = XFS_I(mapping->host);
682 
683 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
684 
685 	if (xfs_is_zoned_inode(ip)) {
686 		struct xfs_zoned_writepage_ctx	xc = {
687 			.ctx = {
688 				.inode	= mapping->host,
689 				.wbc	= wbc,
690 				.ops	= &xfs_zoned_writeback_ops
691 			},
692 		};
693 		int				error;
694 
695 		error = iomap_writepages(&xc.ctx);
696 		if (xc.open_zone)
697 			xfs_open_zone_put(xc.open_zone);
698 		return error;
699 	} else {
700 		struct xfs_writepage_ctx	wpc = {
701 			.ctx = {
702 				.inode	= mapping->host,
703 				.wbc	= wbc,
704 				.ops	= &xfs_writeback_ops
705 			},
706 		};
707 
708 		return iomap_writepages(&wpc.ctx);
709 	}
710 }
711 
712 STATIC int
713 xfs_dax_writepages(
714 	struct address_space	*mapping,
715 	struct writeback_control *wbc)
716 {
717 	struct xfs_inode	*ip = XFS_I(mapping->host);
718 
719 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
720 	return dax_writeback_mapping_range(mapping,
721 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
722 }
723 
724 STATIC sector_t
725 xfs_vm_bmap(
726 	struct address_space	*mapping,
727 	sector_t		block)
728 {
729 	struct xfs_inode	*ip = XFS_I(mapping->host);
730 
731 	trace_xfs_vm_bmap(ip);
732 
733 	/*
734 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
735 	 * bypasses the file system for actual I/O.  We really can't allow
736 	 * that on reflinks inodes, so we have to skip out here.  And yes,
737 	 * 0 is the magic code for a bmap error.
738 	 *
739 	 * Since we don't pass back blockdev info, we can't return bmap
740 	 * information for rt files either.
741 	 */
742 	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
743 		return 0;
744 	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
745 }
746 
747 static void
748 xfs_bio_submit_read(
749 	const struct iomap_iter		*iter,
750 	struct iomap_read_folio_ctx	*ctx)
751 {
752 	struct bio			*bio = ctx->read_ctx;
753 
754 	/* defer read completions to the ioend workqueue */
755 	iomap_init_ioend(iter->inode, bio, ctx->read_ctx_file_offset, 0);
756 	bio->bi_end_io = xfs_end_bio;
757 	submit_bio(bio);
758 }
759 
760 static const struct iomap_read_ops xfs_iomap_read_ops = {
761 	.read_folio_range	= iomap_bio_read_folio_range,
762 	.submit_read		= xfs_bio_submit_read,
763 	.bio_set		= &iomap_ioend_bioset,
764 };
765 
766 static inline const struct iomap_read_ops *
767 xfs_get_iomap_read_ops(
768 	const struct address_space	*mapping)
769 {
770 	struct xfs_inode		*ip = XFS_I(mapping->host);
771 
772 	if (bdev_has_integrity_csum(xfs_inode_buftarg(ip)->bt_bdev))
773 		return &xfs_iomap_read_ops;
774 	return &iomap_bio_read_ops;
775 }
776 
777 STATIC int
778 xfs_vm_read_folio(
779 	struct file			*file,
780 	struct folio			*folio)
781 {
782 	struct iomap_read_folio_ctx	ctx = { .cur_folio = folio };
783 
784 	ctx.ops = xfs_get_iomap_read_ops(folio->mapping);
785 	iomap_read_folio(&xfs_read_iomap_ops, &ctx, NULL);
786 	return 0;
787 }
788 
789 STATIC void
790 xfs_vm_readahead(
791 	struct readahead_control	*rac)
792 {
793 	struct iomap_read_folio_ctx	ctx = { .rac = rac };
794 
795 	ctx.ops = xfs_get_iomap_read_ops(rac->mapping),
796 	iomap_readahead(&xfs_read_iomap_ops, &ctx, NULL);
797 }
798 
799 static int
800 xfs_vm_swap_activate(
801 	struct swap_info_struct		*sis,
802 	struct file			*swap_file,
803 	sector_t			*span)
804 {
805 	struct xfs_inode		*ip = XFS_I(file_inode(swap_file));
806 
807 	if (xfs_is_zoned_inode(ip))
808 		return -EINVAL;
809 
810 	/*
811 	 * Swap file activation can race against concurrent shared extent
812 	 * removal in files that have been cloned.  If this happens,
813 	 * iomap_swapfile_iter() can fail because it encountered a shared
814 	 * extent even though an operation is in progress to remove those
815 	 * shared extents.
816 	 *
817 	 * This race becomes problematic when we defer extent removal
818 	 * operations beyond the end of a syscall (i.e. use async background
819 	 * processing algorithms).  Users think the extents are no longer
820 	 * shared, but iomap_swapfile_iter() still sees them as shared
821 	 * because the refcountbt entries for the extents being removed have
822 	 * not yet been updated.  Hence the swapon call fails unexpectedly.
823 	 *
824 	 * The race condition is currently most obvious from the unlink()
825 	 * operation as extent removal is deferred until after the last
826 	 * reference to the inode goes away.  We then process the extent
827 	 * removal asynchronously, hence triggers the "syscall completed but
828 	 * work not done" condition mentioned above.  To close this race
829 	 * window, we need to flush any pending inodegc operations to ensure
830 	 * they have updated the refcountbt records before we try to map the
831 	 * swapfile.
832 	 */
833 	xfs_inodegc_flush(ip->i_mount);
834 
835 	/*
836 	 * Direct the swap code to the correct block device when this file
837 	 * sits on the RT device.
838 	 */
839 	sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
840 
841 	return iomap_swapfile_activate(sis, swap_file, span,
842 			&xfs_read_iomap_ops);
843 }
844 
845 const struct address_space_operations xfs_address_space_operations = {
846 	.read_folio		= xfs_vm_read_folio,
847 	.readahead		= xfs_vm_readahead,
848 	.writepages		= xfs_vm_writepages,
849 	.dirty_folio		= iomap_dirty_folio,
850 	.release_folio		= iomap_release_folio,
851 	.invalidate_folio	= iomap_invalidate_folio,
852 	.bmap			= xfs_vm_bmap,
853 	.migrate_folio		= filemap_migrate_folio,
854 	.is_partially_uptodate  = iomap_is_partially_uptodate,
855 	.error_remove_folio	= generic_error_remove_folio,
856 	.swap_activate		= xfs_vm_swap_activate,
857 };
858 
859 const struct address_space_operations xfs_dax_aops = {
860 	.writepages		= xfs_dax_writepages,
861 	.dirty_folio		= noop_dirty_folio,
862 	.swap_activate		= xfs_vm_swap_activate,
863 };
864