xref: /linux/fs/xfs/xfs_aops.c (revision 03f76ddff5b04a808ae16c06418460151e2fdd4b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * Copyright (c) 2016-2025 Christoph Hellwig.
5  * All Rights Reserved.
6  */
7 #include "xfs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_iomap.h"
16 #include "xfs_trace.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_util.h"
19 #include "xfs_reflink.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_icache.h"
23 #include "xfs_zone_alloc.h"
24 #include "xfs_rtgroup.h"
25 
26 struct xfs_writepage_ctx {
27 	struct iomap_writepage_ctx ctx;
28 	unsigned int		data_seq;
29 	unsigned int		cow_seq;
30 };
31 
32 static inline struct xfs_writepage_ctx *
33 XFS_WPC(struct iomap_writepage_ctx *ctx)
34 {
35 	return container_of(ctx, struct xfs_writepage_ctx, ctx);
36 }
37 
38 /*
39  * Fast and loose check if this write could update the on-disk inode size.
40  */
41 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
42 {
43 	return ioend->io_offset + ioend->io_size >
44 		XFS_I(ioend->io_inode)->i_disk_size;
45 }
46 
47 /*
48  * Update on-disk file size now that data has been written to disk.
49  */
50 int
51 xfs_setfilesize(
52 	struct xfs_inode	*ip,
53 	xfs_off_t		offset,
54 	size_t			size)
55 {
56 	struct xfs_mount	*mp = ip->i_mount;
57 	struct xfs_trans	*tp;
58 	xfs_fsize_t		isize;
59 	int			error;
60 
61 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
62 	if (error)
63 		return error;
64 
65 	xfs_ilock(ip, XFS_ILOCK_EXCL);
66 	isize = xfs_new_eof(ip, offset + size);
67 	if (!isize) {
68 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
69 		xfs_trans_cancel(tp);
70 		return 0;
71 	}
72 
73 	trace_xfs_setfilesize(ip, offset, size);
74 
75 	ip->i_disk_size = isize;
76 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
77 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
78 
79 	return xfs_trans_commit(tp);
80 }
81 
82 static void
83 xfs_ioend_put_open_zones(
84 	struct iomap_ioend	*ioend)
85 {
86 	struct iomap_ioend *tmp;
87 
88 	/*
89 	 * Put the open zone for all ioends merged into this one (if any).
90 	 */
91 	list_for_each_entry(tmp, &ioend->io_list, io_list)
92 		xfs_open_zone_put(tmp->io_private);
93 
94 	/*
95 	 * The main ioend might not have an open zone if the submission failed
96 	 * before xfs_zone_alloc_and_submit got called.
97 	 */
98 	if (ioend->io_private)
99 		xfs_open_zone_put(ioend->io_private);
100 }
101 
102 /*
103  * IO write completion.
104  */
105 STATIC void
106 xfs_end_ioend(
107 	struct iomap_ioend	*ioend)
108 {
109 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
110 	struct xfs_mount	*mp = ip->i_mount;
111 	bool			is_zoned = xfs_is_zoned_inode(ip);
112 	xfs_off_t		offset = ioend->io_offset;
113 	size_t			size = ioend->io_size;
114 	unsigned int		nofs_flag;
115 	int			error;
116 
117 	/*
118 	 * We can allocate memory here while doing writeback on behalf of
119 	 * memory reclaim.  To avoid memory allocation deadlocks set the
120 	 * task-wide nofs context for the following operations.
121 	 */
122 	nofs_flag = memalloc_nofs_save();
123 
124 	/*
125 	 * Just clean up the in-memory structures if the fs has been shut down.
126 	 */
127 	if (xfs_is_shutdown(mp)) {
128 		error = -EIO;
129 		goto done;
130 	}
131 
132 	/*
133 	 * Clean up all COW blocks and underlying data fork delalloc blocks on
134 	 * I/O error. The delalloc punch is required because this ioend was
135 	 * mapped to blocks in the COW fork and the associated pages are no
136 	 * longer dirty. If we don't remove delalloc blocks here, they become
137 	 * stale and can corrupt free space accounting on unmount.
138 	 */
139 	error = blk_status_to_errno(ioend->io_bio.bi_status);
140 	if (unlikely(error)) {
141 		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
142 			ASSERT(!is_zoned);
143 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
144 			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
145 					offset + size, NULL);
146 		}
147 		goto done;
148 	}
149 
150 	/*
151 	 * Success: commit the COW or unwritten blocks if needed.
152 	 */
153 	if (is_zoned)
154 		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
155 				ioend->io_private, NULLFSBLOCK);
156 	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
157 		error = xfs_reflink_end_cow(ip, offset, size);
158 	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
159 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
160 
161 	if (!error &&
162 	    !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
163 	    xfs_ioend_is_append(ioend))
164 		error = xfs_setfilesize(ip, offset, size);
165 done:
166 	if (is_zoned)
167 		xfs_ioend_put_open_zones(ioend);
168 	iomap_finish_ioends(ioend, error);
169 	memalloc_nofs_restore(nofs_flag);
170 }
171 
172 /*
173  * Finish all pending IO completions that require transactional modifications.
174  *
175  * We try to merge physical and logically contiguous ioends before completion to
176  * minimise the number of transactions we need to perform during IO completion.
177  * Both unwritten extent conversion and COW remapping need to iterate and modify
178  * one physical extent at a time, so we gain nothing by merging physically
179  * discontiguous extents here.
180  *
181  * The ioend chain length that we can be processing here is largely unbound in
182  * length and we may have to perform significant amounts of work on each ioend
183  * to complete it. Hence we have to be careful about holding the CPU for too
184  * long in this loop.
185  */
186 void
187 xfs_end_io(
188 	struct work_struct	*work)
189 {
190 	struct xfs_inode	*ip =
191 		container_of(work, struct xfs_inode, i_ioend_work);
192 	struct iomap_ioend	*ioend;
193 	struct list_head	tmp;
194 	unsigned long		flags;
195 
196 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
197 	list_replace_init(&ip->i_ioend_list, &tmp);
198 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
199 
200 	iomap_sort_ioends(&tmp);
201 	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
202 			io_list))) {
203 		list_del_init(&ioend->io_list);
204 		iomap_ioend_try_merge(ioend, &tmp);
205 		xfs_end_ioend(ioend);
206 		cond_resched();
207 	}
208 }
209 
210 void
211 xfs_end_bio(
212 	struct bio		*bio)
213 {
214 	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
215 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
216 	struct xfs_mount	*mp = ip->i_mount;
217 	unsigned long		flags;
218 
219 	/*
220 	 * For Appends record the actually written block number and set the
221 	 * boundary flag if needed.
222 	 */
223 	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
224 		ioend->io_sector = bio->bi_iter.bi_sector;
225 		xfs_mark_rtg_boundary(ioend);
226 	}
227 
228 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
229 	if (list_empty(&ip->i_ioend_list))
230 		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
231 					 &ip->i_ioend_work));
232 	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
233 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
234 }
235 
236 /*
237  * We cannot cancel the ioend directly on error.  We may have already set other
238  * pages under writeback and hence we have to run I/O completion to mark the
239  * error state of the pages under writeback appropriately.
240  *
241  * If the folio has delalloc blocks on it, the caller is asking us to punch them
242  * out. If we don't, we can leave a stale delalloc mapping covered by a clean
243  * page that needs to be dirtied again before the delalloc mapping can be
244  * converted. This stale delalloc mapping can trip up a later direct I/O read
245  * operation on the same region.
246  *
247  * We prevent this by truncating away the delalloc regions on the folio. Because
248  * they are delalloc, we can do this without needing a transaction. Indeed - if
249  * we get ENOSPC errors, we have to be able to do this truncation without a
250  * transaction as there is no space left for block reservation (typically why
251  * we see a ENOSPC in writeback).
252  */
253 static void
254 xfs_discard_folio(
255 	struct folio		*folio,
256 	loff_t			pos)
257 {
258 	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
259 	struct xfs_mount	*mp = ip->i_mount;
260 
261 	if (xfs_is_shutdown(mp))
262 		return;
263 
264 	xfs_alert_ratelimited(mp,
265 		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
266 			folio, ip->i_ino, pos);
267 
268 	/*
269 	 * The end of the punch range is always the offset of the first
270 	 * byte of the next folio. Hence the end offset is only dependent on the
271 	 * folio itself and not the start offset that is passed in.
272 	 */
273 	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
274 				folio_pos(folio) + folio_size(folio), NULL);
275 }
276 
277 /*
278  * Fast revalidation of the cached writeback mapping. Return true if the current
279  * mapping is valid, false otherwise.
280  */
281 static bool
282 xfs_imap_valid(
283 	struct iomap_writepage_ctx	*wpc,
284 	struct xfs_inode		*ip,
285 	loff_t				offset)
286 {
287 	if (offset < wpc->iomap.offset ||
288 	    offset >= wpc->iomap.offset + wpc->iomap.length)
289 		return false;
290 	/*
291 	 * If this is a COW mapping, it is sufficient to check that the mapping
292 	 * covers the offset. Be careful to check this first because the caller
293 	 * can revalidate a COW mapping without updating the data seqno.
294 	 */
295 	if (wpc->iomap.flags & IOMAP_F_SHARED)
296 		return true;
297 
298 	/*
299 	 * This is not a COW mapping. Check the sequence number of the data fork
300 	 * because concurrent changes could have invalidated the extent. Check
301 	 * the COW fork because concurrent changes since the last time we
302 	 * checked (and found nothing at this offset) could have added
303 	 * overlapping blocks.
304 	 */
305 	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
306 		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
307 				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
308 		return false;
309 	}
310 	if (xfs_inode_has_cow_data(ip) &&
311 	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
312 		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
313 				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
314 		return false;
315 	}
316 	return true;
317 }
318 
319 static int
320 xfs_map_blocks(
321 	struct iomap_writepage_ctx *wpc,
322 	loff_t			offset,
323 	unsigned int		len)
324 {
325 	struct xfs_inode	*ip = XFS_I(wpc->inode);
326 	struct xfs_mount	*mp = ip->i_mount;
327 	ssize_t			count = i_blocksize(wpc->inode);
328 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
329 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
330 	xfs_fileoff_t		cow_fsb;
331 	int			whichfork;
332 	struct xfs_bmbt_irec	imap;
333 	struct xfs_iext_cursor	icur;
334 	int			retries = 0;
335 	int			error = 0;
336 	unsigned int		*seq;
337 
338 	if (xfs_is_shutdown(mp))
339 		return -EIO;
340 
341 	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
342 
343 	/*
344 	 * COW fork blocks can overlap data fork blocks even if the blocks
345 	 * aren't shared.  COW I/O always takes precedent, so we must always
346 	 * check for overlap on reflink inodes unless the mapping is already a
347 	 * COW one, or the COW fork hasn't changed from the last time we looked
348 	 * at it.
349 	 *
350 	 * It's safe to check the COW fork if_seq here without the ILOCK because
351 	 * we've indirectly protected against concurrent updates: writeback has
352 	 * the page locked, which prevents concurrent invalidations by reflink
353 	 * and directio and prevents concurrent buffered writes to the same
354 	 * page.  Changes to if_seq always happen under i_lock, which protects
355 	 * against concurrent updates and provides a memory barrier on the way
356 	 * out that ensures that we always see the current value.
357 	 */
358 	if (xfs_imap_valid(wpc, ip, offset))
359 		return 0;
360 
361 	/*
362 	 * If we don't have a valid map, now it's time to get a new one for this
363 	 * offset.  This will convert delayed allocations (including COW ones)
364 	 * into real extents.  If we return without a valid map, it means we
365 	 * landed in a hole and we skip the block.
366 	 */
367 retry:
368 	cow_fsb = NULLFILEOFF;
369 	whichfork = XFS_DATA_FORK;
370 	xfs_ilock(ip, XFS_ILOCK_SHARED);
371 	ASSERT(!xfs_need_iread_extents(&ip->i_df));
372 
373 	/*
374 	 * Check if this is offset is covered by a COW extents, and if yes use
375 	 * it directly instead of looking up anything in the data fork.
376 	 */
377 	if (xfs_inode_has_cow_data(ip) &&
378 	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
379 		cow_fsb = imap.br_startoff;
380 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
381 		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
382 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
383 
384 		whichfork = XFS_COW_FORK;
385 		goto allocate_blocks;
386 	}
387 
388 	/*
389 	 * No COW extent overlap. Revalidate now that we may have updated
390 	 * ->cow_seq. If the data mapping is still valid, we're done.
391 	 */
392 	if (xfs_imap_valid(wpc, ip, offset)) {
393 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
394 		return 0;
395 	}
396 
397 	/*
398 	 * If we don't have a valid map, now it's time to get a new one for this
399 	 * offset.  This will convert delayed allocations (including COW ones)
400 	 * into real extents.
401 	 */
402 	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
403 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
404 	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
405 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
406 
407 	/* landed in a hole or beyond EOF? */
408 	if (imap.br_startoff > offset_fsb) {
409 		imap.br_blockcount = imap.br_startoff - offset_fsb;
410 		imap.br_startoff = offset_fsb;
411 		imap.br_startblock = HOLESTARTBLOCK;
412 		imap.br_state = XFS_EXT_NORM;
413 	}
414 
415 	/*
416 	 * Truncate to the next COW extent if there is one.  This is the only
417 	 * opportunity to do this because we can skip COW fork lookups for the
418 	 * subsequent blocks in the mapping; however, the requirement to treat
419 	 * the COW range separately remains.
420 	 */
421 	if (cow_fsb != NULLFILEOFF &&
422 	    cow_fsb < imap.br_startoff + imap.br_blockcount)
423 		imap.br_blockcount = cow_fsb - imap.br_startoff;
424 
425 	/* got a delalloc extent? */
426 	if (imap.br_startblock != HOLESTARTBLOCK &&
427 	    isnullstartblock(imap.br_startblock))
428 		goto allocate_blocks;
429 
430 	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
431 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
432 	return 0;
433 allocate_blocks:
434 	/*
435 	 * Convert a dellalloc extent to a real one. The current page is held
436 	 * locked so nothing could have removed the block backing offset_fsb,
437 	 * although it could have moved from the COW to the data fork by another
438 	 * thread.
439 	 */
440 	if (whichfork == XFS_COW_FORK)
441 		seq = &XFS_WPC(wpc)->cow_seq;
442 	else
443 		seq = &XFS_WPC(wpc)->data_seq;
444 
445 	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
446 				&wpc->iomap, seq);
447 	if (error) {
448 		/*
449 		 * If we failed to find the extent in the COW fork we might have
450 		 * raced with a COW to data fork conversion or truncate.
451 		 * Restart the lookup to catch the extent in the data fork for
452 		 * the former case, but prevent additional retries to avoid
453 		 * looping forever for the latter case.
454 		 */
455 		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
456 			goto retry;
457 		ASSERT(error != -EAGAIN);
458 		return error;
459 	}
460 
461 	/*
462 	 * Due to merging the return real extent might be larger than the
463 	 * original delalloc one.  Trim the return extent to the next COW
464 	 * boundary again to force a re-lookup.
465 	 */
466 	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
467 		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
468 
469 		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
470 			wpc->iomap.length = cow_offset - wpc->iomap.offset;
471 	}
472 
473 	ASSERT(wpc->iomap.offset <= offset);
474 	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
475 	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
476 	return 0;
477 }
478 
479 static ssize_t
480 xfs_writeback_range(
481 	struct iomap_writepage_ctx *wpc,
482 	struct folio		*folio,
483 	u64			offset,
484 	unsigned int		len,
485 	u64			end_pos)
486 {
487 	ssize_t			ret;
488 
489 	ret = xfs_map_blocks(wpc, offset, len);
490 	if (!ret)
491 		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
492 	if (ret < 0)
493 		xfs_discard_folio(folio, offset);
494 	return ret;
495 }
496 
497 static bool
498 xfs_ioend_needs_wq_completion(
499 	struct iomap_ioend	*ioend)
500 {
501 	/* Changing inode size requires a transaction. */
502 	if (xfs_ioend_is_append(ioend))
503 		return true;
504 
505 	/* Extent manipulation requires a transaction. */
506 	if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
507 		return true;
508 
509 	/* Page cache invalidation cannot be done in irq context. */
510 	if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
511 		return true;
512 
513 	return false;
514 }
515 
516 static int
517 xfs_writeback_submit(
518 	struct iomap_writepage_ctx	*wpc,
519 	int				error)
520 {
521 	struct iomap_ioend		*ioend = wpc->wb_ctx;
522 
523 	/*
524 	 * Convert CoW extents to regular.
525 	 *
526 	 * We can allocate memory here while doing writeback on behalf of memory
527 	 * reclaim.  To avoid memory allocation deadlocks, set the task-wide
528 	 * nofs context.
529 	 */
530 	if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
531 		unsigned int		nofs_flag;
532 
533 		nofs_flag = memalloc_nofs_save();
534 		error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
535 				ioend->io_offset, ioend->io_size);
536 		memalloc_nofs_restore(nofs_flag);
537 	}
538 
539 	/*
540 	 * Send ioends that might require a transaction to the completion wq.
541 	 */
542 	if (xfs_ioend_needs_wq_completion(ioend))
543 		ioend->io_bio.bi_end_io = xfs_end_bio;
544 
545 	return iomap_ioend_writeback_submit(wpc, error);
546 }
547 
548 static const struct iomap_writeback_ops xfs_writeback_ops = {
549 	.writeback_range	= xfs_writeback_range,
550 	.writeback_submit	= xfs_writeback_submit,
551 };
552 
553 struct xfs_zoned_writepage_ctx {
554 	struct iomap_writepage_ctx	ctx;
555 	struct xfs_open_zone		*open_zone;
556 };
557 
558 static inline struct xfs_zoned_writepage_ctx *
559 XFS_ZWPC(struct iomap_writepage_ctx *ctx)
560 {
561 	return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
562 }
563 
564 static int
565 xfs_zoned_map_blocks(
566 	struct iomap_writepage_ctx *wpc,
567 	loff_t			offset,
568 	unsigned int		len)
569 {
570 	struct xfs_inode	*ip = XFS_I(wpc->inode);
571 	struct xfs_mount	*mp = ip->i_mount;
572 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
573 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
574 	xfs_filblks_t		count_fsb;
575 	struct xfs_bmbt_irec	imap, del;
576 	struct xfs_iext_cursor	icur;
577 
578 	if (xfs_is_shutdown(mp))
579 		return -EIO;
580 
581 	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
582 
583 	/*
584 	 * All dirty data must be covered by delalloc extents.  But truncate can
585 	 * remove delalloc extents underneath us or reduce their size.
586 	 * Returning a hole tells iomap to not write back any data from this
587 	 * range, which is the right thing to do in that case.
588 	 *
589 	 * Otherwise just tell iomap to treat ranges previously covered by a
590 	 * delalloc extent as mapped.  The actual block allocation will be done
591 	 * just before submitting the bio.
592 	 *
593 	 * This implies we never map outside folios that are locked or marked
594 	 * as under writeback, and thus there is no need check the fork sequence
595 	 * count here.
596 	 */
597 	xfs_ilock(ip, XFS_ILOCK_EXCL);
598 	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
599 		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
600 	if (imap.br_startoff > offset_fsb) {
601 		imap.br_blockcount = imap.br_startoff - offset_fsb;
602 		imap.br_startoff = offset_fsb;
603 		imap.br_startblock = HOLESTARTBLOCK;
604 		imap.br_state = XFS_EXT_NORM;
605 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
606 		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
607 		return 0;
608 	}
609 	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
610 	count_fsb = end_fsb - offset_fsb;
611 
612 	del = imap;
613 	xfs_trim_extent(&del, offset_fsb, count_fsb);
614 	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
615 			XFS_BMAPI_REMAP);
616 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
617 
618 	wpc->iomap.type = IOMAP_MAPPED;
619 	wpc->iomap.flags = IOMAP_F_DIRTY;
620 	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
621 	wpc->iomap.offset = offset;
622 	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
623 	wpc->iomap.flags = IOMAP_F_ANON_WRITE;
624 
625 	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
626 	return 0;
627 }
628 
629 static ssize_t
630 xfs_zoned_writeback_range(
631 	struct iomap_writepage_ctx *wpc,
632 	struct folio		*folio,
633 	u64			offset,
634 	unsigned int		len,
635 	u64			end_pos)
636 {
637 	ssize_t			ret;
638 
639 	ret = xfs_zoned_map_blocks(wpc, offset, len);
640 	if (!ret)
641 		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
642 	if (ret < 0)
643 		xfs_discard_folio(folio, offset);
644 	return ret;
645 }
646 
647 static int
648 xfs_zoned_writeback_submit(
649 	struct iomap_writepage_ctx	*wpc,
650 	int				error)
651 {
652 	struct iomap_ioend		*ioend = wpc->wb_ctx;
653 
654 	ioend->io_bio.bi_end_io = xfs_end_bio;
655 	if (error) {
656 		ioend->io_bio.bi_status = errno_to_blk_status(error);
657 		bio_endio(&ioend->io_bio);
658 		return error;
659 	}
660 	xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
661 	return 0;
662 }
663 
664 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
665 	.writeback_range	= xfs_zoned_writeback_range,
666 	.writeback_submit	= xfs_zoned_writeback_submit,
667 };
668 
669 STATIC int
670 xfs_vm_writepages(
671 	struct address_space	*mapping,
672 	struct writeback_control *wbc)
673 {
674 	struct xfs_inode	*ip = XFS_I(mapping->host);
675 
676 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
677 
678 	if (xfs_is_zoned_inode(ip)) {
679 		struct xfs_zoned_writepage_ctx	xc = {
680 			.ctx = {
681 				.inode	= mapping->host,
682 				.wbc	= wbc,
683 				.ops	= &xfs_zoned_writeback_ops
684 			},
685 		};
686 		int				error;
687 
688 		error = iomap_writepages(&xc.ctx);
689 		if (xc.open_zone)
690 			xfs_open_zone_put(xc.open_zone);
691 		return error;
692 	} else {
693 		struct xfs_writepage_ctx	wpc = {
694 			.ctx = {
695 				.inode	= mapping->host,
696 				.wbc	= wbc,
697 				.ops	= &xfs_writeback_ops
698 			},
699 		};
700 
701 		return iomap_writepages(&wpc.ctx);
702 	}
703 }
704 
705 STATIC int
706 xfs_dax_writepages(
707 	struct address_space	*mapping,
708 	struct writeback_control *wbc)
709 {
710 	struct xfs_inode	*ip = XFS_I(mapping->host);
711 
712 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
713 	return dax_writeback_mapping_range(mapping,
714 			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
715 }
716 
717 STATIC sector_t
718 xfs_vm_bmap(
719 	struct address_space	*mapping,
720 	sector_t		block)
721 {
722 	struct xfs_inode	*ip = XFS_I(mapping->host);
723 
724 	trace_xfs_vm_bmap(ip);
725 
726 	/*
727 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
728 	 * bypasses the file system for actual I/O.  We really can't allow
729 	 * that on reflinks inodes, so we have to skip out here.  And yes,
730 	 * 0 is the magic code for a bmap error.
731 	 *
732 	 * Since we don't pass back blockdev info, we can't return bmap
733 	 * information for rt files either.
734 	 */
735 	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
736 		return 0;
737 	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
738 }
739 
740 STATIC int
741 xfs_vm_read_folio(
742 	struct file		*unused,
743 	struct folio		*folio)
744 {
745 	return iomap_read_folio(folio, &xfs_read_iomap_ops);
746 }
747 
748 STATIC void
749 xfs_vm_readahead(
750 	struct readahead_control	*rac)
751 {
752 	iomap_readahead(rac, &xfs_read_iomap_ops);
753 }
754 
755 static int
756 xfs_vm_swap_activate(
757 	struct swap_info_struct		*sis,
758 	struct file			*swap_file,
759 	sector_t			*span)
760 {
761 	struct xfs_inode		*ip = XFS_I(file_inode(swap_file));
762 
763 	if (xfs_is_zoned_inode(ip))
764 		return -EINVAL;
765 
766 	/*
767 	 * Swap file activation can race against concurrent shared extent
768 	 * removal in files that have been cloned.  If this happens,
769 	 * iomap_swapfile_iter() can fail because it encountered a shared
770 	 * extent even though an operation is in progress to remove those
771 	 * shared extents.
772 	 *
773 	 * This race becomes problematic when we defer extent removal
774 	 * operations beyond the end of a syscall (i.e. use async background
775 	 * processing algorithms).  Users think the extents are no longer
776 	 * shared, but iomap_swapfile_iter() still sees them as shared
777 	 * because the refcountbt entries for the extents being removed have
778 	 * not yet been updated.  Hence the swapon call fails unexpectedly.
779 	 *
780 	 * The race condition is currently most obvious from the unlink()
781 	 * operation as extent removal is deferred until after the last
782 	 * reference to the inode goes away.  We then process the extent
783 	 * removal asynchronously, hence triggers the "syscall completed but
784 	 * work not done" condition mentioned above.  To close this race
785 	 * window, we need to flush any pending inodegc operations to ensure
786 	 * they have updated the refcountbt records before we try to map the
787 	 * swapfile.
788 	 */
789 	xfs_inodegc_flush(ip->i_mount);
790 
791 	/*
792 	 * Direct the swap code to the correct block device when this file
793 	 * sits on the RT device.
794 	 */
795 	sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
796 
797 	return iomap_swapfile_activate(sis, swap_file, span,
798 			&xfs_read_iomap_ops);
799 }
800 
801 const struct address_space_operations xfs_address_space_operations = {
802 	.read_folio		= xfs_vm_read_folio,
803 	.readahead		= xfs_vm_readahead,
804 	.writepages		= xfs_vm_writepages,
805 	.dirty_folio		= iomap_dirty_folio,
806 	.release_folio		= iomap_release_folio,
807 	.invalidate_folio	= iomap_invalidate_folio,
808 	.bmap			= xfs_vm_bmap,
809 	.migrate_folio		= filemap_migrate_folio,
810 	.is_partially_uptodate  = iomap_is_partially_uptodate,
811 	.error_remove_folio	= generic_error_remove_folio,
812 	.swap_activate		= xfs_vm_swap_activate,
813 };
814 
815 const struct address_space_operations xfs_dax_aops = {
816 	.writepages		= xfs_dax_writepages,
817 	.dirty_folio		= noop_dirty_folio,
818 	.swap_activate		= xfs_vm_swap_activate,
819 };
820