1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * Copyright (c) 2016-2025 Christoph Hellwig.
5 * All Rights Reserved.
6 */
7 #include "xfs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_iomap.h"
16 #include "xfs_trace.h"
17 #include "xfs_bmap.h"
18 #include "xfs_bmap_util.h"
19 #include "xfs_reflink.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_icache.h"
23 #include "xfs_zone_alloc.h"
24 #include "xfs_rtgroup.h"
25
26 struct xfs_writepage_ctx {
27 struct iomap_writepage_ctx ctx;
28 unsigned int data_seq;
29 unsigned int cow_seq;
30 };
31
32 static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx * ctx)33 XFS_WPC(struct iomap_writepage_ctx *ctx)
34 {
35 return container_of(ctx, struct xfs_writepage_ctx, ctx);
36 }
37
38 /*
39 * Fast and loose check if this write could update the on-disk inode size.
40 */
xfs_ioend_is_append(struct iomap_ioend * ioend)41 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
42 {
43 return ioend->io_offset + ioend->io_size >
44 XFS_I(ioend->io_inode)->i_disk_size;
45 }
46
47 /*
48 * Update on-disk file size now that data has been written to disk.
49 */
50 int
xfs_setfilesize(struct xfs_inode * ip,xfs_off_t offset,size_t size)51 xfs_setfilesize(
52 struct xfs_inode *ip,
53 xfs_off_t offset,
54 size_t size)
55 {
56 struct xfs_mount *mp = ip->i_mount;
57 struct xfs_trans *tp;
58 xfs_fsize_t isize;
59 int error;
60
61 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
62 if (error)
63 return error;
64
65 xfs_ilock(ip, XFS_ILOCK_EXCL);
66 isize = xfs_new_eof(ip, offset + size);
67 if (!isize) {
68 xfs_iunlock(ip, XFS_ILOCK_EXCL);
69 xfs_trans_cancel(tp);
70 return 0;
71 }
72
73 trace_xfs_setfilesize(ip, offset, size);
74
75 ip->i_disk_size = isize;
76 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
77 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
78
79 return xfs_trans_commit(tp);
80 }
81
82 static void
xfs_ioend_put_open_zones(struct iomap_ioend * ioend)83 xfs_ioend_put_open_zones(
84 struct iomap_ioend *ioend)
85 {
86 struct iomap_ioend *tmp;
87
88 /*
89 * Put the open zone for all ioends merged into this one (if any).
90 */
91 list_for_each_entry(tmp, &ioend->io_list, io_list)
92 xfs_open_zone_put(tmp->io_private);
93
94 /*
95 * The main ioend might not have an open zone if the submission failed
96 * before xfs_zone_alloc_and_submit got called.
97 */
98 if (ioend->io_private)
99 xfs_open_zone_put(ioend->io_private);
100 }
101
102 /*
103 * IO write completion.
104 */
105 STATIC void
xfs_end_ioend(struct iomap_ioend * ioend)106 xfs_end_ioend(
107 struct iomap_ioend *ioend)
108 {
109 struct xfs_inode *ip = XFS_I(ioend->io_inode);
110 struct xfs_mount *mp = ip->i_mount;
111 bool is_zoned = xfs_is_zoned_inode(ip);
112 xfs_off_t offset = ioend->io_offset;
113 size_t size = ioend->io_size;
114 unsigned int nofs_flag;
115 int error;
116
117 /*
118 * We can allocate memory here while doing writeback on behalf of
119 * memory reclaim. To avoid memory allocation deadlocks set the
120 * task-wide nofs context for the following operations.
121 */
122 nofs_flag = memalloc_nofs_save();
123
124 /*
125 * Just clean up the in-memory structures if the fs has been shut down.
126 */
127 if (xfs_is_shutdown(mp)) {
128 error = -EIO;
129 goto done;
130 }
131
132 /*
133 * Clean up all COW blocks and underlying data fork delalloc blocks on
134 * I/O error. The delalloc punch is required because this ioend was
135 * mapped to blocks in the COW fork and the associated pages are no
136 * longer dirty. If we don't remove delalloc blocks here, they become
137 * stale and can corrupt free space accounting on unmount.
138 */
139 error = blk_status_to_errno(ioend->io_bio.bi_status);
140 if (unlikely(error)) {
141 if (ioend->io_flags & IOMAP_IOEND_SHARED) {
142 ASSERT(!is_zoned);
143 xfs_reflink_cancel_cow_range(ip, offset, size, true);
144 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
145 offset + size, NULL);
146 }
147 goto done;
148 }
149
150 /*
151 * Success: commit the COW or unwritten blocks if needed.
152 */
153 if (is_zoned)
154 error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
155 ioend->io_private, NULLFSBLOCK);
156 else if (ioend->io_flags & IOMAP_IOEND_SHARED)
157 error = xfs_reflink_end_cow(ip, offset, size);
158 else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
159 error = xfs_iomap_write_unwritten(ip, offset, size, false);
160
161 if (!error &&
162 !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
163 xfs_ioend_is_append(ioend))
164 error = xfs_setfilesize(ip, offset, size);
165 done:
166 if (is_zoned)
167 xfs_ioend_put_open_zones(ioend);
168 iomap_finish_ioends(ioend, error);
169 memalloc_nofs_restore(nofs_flag);
170 }
171
172 /*
173 * Finish all pending IO completions that require transactional modifications.
174 *
175 * We try to merge physical and logically contiguous ioends before completion to
176 * minimise the number of transactions we need to perform during IO completion.
177 * Both unwritten extent conversion and COW remapping need to iterate and modify
178 * one physical extent at a time, so we gain nothing by merging physically
179 * discontiguous extents here.
180 *
181 * The ioend chain length that we can be processing here is largely unbound in
182 * length and we may have to perform significant amounts of work on each ioend
183 * to complete it. Hence we have to be careful about holding the CPU for too
184 * long in this loop.
185 */
186 void
xfs_end_io(struct work_struct * work)187 xfs_end_io(
188 struct work_struct *work)
189 {
190 struct xfs_inode *ip =
191 container_of(work, struct xfs_inode, i_ioend_work);
192 struct iomap_ioend *ioend;
193 struct list_head tmp;
194 unsigned long flags;
195
196 spin_lock_irqsave(&ip->i_ioend_lock, flags);
197 list_replace_init(&ip->i_ioend_list, &tmp);
198 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
199
200 iomap_sort_ioends(&tmp);
201 while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
202 io_list))) {
203 list_del_init(&ioend->io_list);
204 iomap_ioend_try_merge(ioend, &tmp);
205 xfs_end_ioend(ioend);
206 cond_resched();
207 }
208 }
209
210 void
xfs_end_bio(struct bio * bio)211 xfs_end_bio(
212 struct bio *bio)
213 {
214 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
215 struct xfs_inode *ip = XFS_I(ioend->io_inode);
216 struct xfs_mount *mp = ip->i_mount;
217 unsigned long flags;
218
219 /*
220 * For Appends record the actually written block number and set the
221 * boundary flag if needed.
222 */
223 if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
224 ioend->io_sector = bio->bi_iter.bi_sector;
225 xfs_mark_rtg_boundary(ioend);
226 }
227
228 spin_lock_irqsave(&ip->i_ioend_lock, flags);
229 if (list_empty(&ip->i_ioend_list))
230 WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
231 &ip->i_ioend_work));
232 list_add_tail(&ioend->io_list, &ip->i_ioend_list);
233 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
234 }
235
236 /*
237 * Fast revalidation of the cached writeback mapping. Return true if the current
238 * mapping is valid, false otherwise.
239 */
240 static bool
xfs_imap_valid(struct iomap_writepage_ctx * wpc,struct xfs_inode * ip,loff_t offset)241 xfs_imap_valid(
242 struct iomap_writepage_ctx *wpc,
243 struct xfs_inode *ip,
244 loff_t offset)
245 {
246 if (offset < wpc->iomap.offset ||
247 offset >= wpc->iomap.offset + wpc->iomap.length)
248 return false;
249 /*
250 * If this is a COW mapping, it is sufficient to check that the mapping
251 * covers the offset. Be careful to check this first because the caller
252 * can revalidate a COW mapping without updating the data seqno.
253 */
254 if (wpc->iomap.flags & IOMAP_F_SHARED)
255 return true;
256
257 /*
258 * This is not a COW mapping. Check the sequence number of the data fork
259 * because concurrent changes could have invalidated the extent. Check
260 * the COW fork because concurrent changes since the last time we
261 * checked (and found nothing at this offset) could have added
262 * overlapping blocks.
263 */
264 if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
265 trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
266 XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
267 return false;
268 }
269 if (xfs_inode_has_cow_data(ip) &&
270 XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
271 trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
272 XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
273 return false;
274 }
275 return true;
276 }
277
278 static int
xfs_map_blocks(struct iomap_writepage_ctx * wpc,struct inode * inode,loff_t offset,unsigned int len)279 xfs_map_blocks(
280 struct iomap_writepage_ctx *wpc,
281 struct inode *inode,
282 loff_t offset,
283 unsigned int len)
284 {
285 struct xfs_inode *ip = XFS_I(inode);
286 struct xfs_mount *mp = ip->i_mount;
287 ssize_t count = i_blocksize(inode);
288 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
289 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
290 xfs_fileoff_t cow_fsb;
291 int whichfork;
292 struct xfs_bmbt_irec imap;
293 struct xfs_iext_cursor icur;
294 int retries = 0;
295 int error = 0;
296 unsigned int *seq;
297
298 if (xfs_is_shutdown(mp))
299 return -EIO;
300
301 XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
302
303 /*
304 * COW fork blocks can overlap data fork blocks even if the blocks
305 * aren't shared. COW I/O always takes precedent, so we must always
306 * check for overlap on reflink inodes unless the mapping is already a
307 * COW one, or the COW fork hasn't changed from the last time we looked
308 * at it.
309 *
310 * It's safe to check the COW fork if_seq here without the ILOCK because
311 * we've indirectly protected against concurrent updates: writeback has
312 * the page locked, which prevents concurrent invalidations by reflink
313 * and directio and prevents concurrent buffered writes to the same
314 * page. Changes to if_seq always happen under i_lock, which protects
315 * against concurrent updates and provides a memory barrier on the way
316 * out that ensures that we always see the current value.
317 */
318 if (xfs_imap_valid(wpc, ip, offset))
319 return 0;
320
321 /*
322 * If we don't have a valid map, now it's time to get a new one for this
323 * offset. This will convert delayed allocations (including COW ones)
324 * into real extents. If we return without a valid map, it means we
325 * landed in a hole and we skip the block.
326 */
327 retry:
328 cow_fsb = NULLFILEOFF;
329 whichfork = XFS_DATA_FORK;
330 xfs_ilock(ip, XFS_ILOCK_SHARED);
331 ASSERT(!xfs_need_iread_extents(&ip->i_df));
332
333 /*
334 * Check if this is offset is covered by a COW extents, and if yes use
335 * it directly instead of looking up anything in the data fork.
336 */
337 if (xfs_inode_has_cow_data(ip) &&
338 xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
339 cow_fsb = imap.br_startoff;
340 if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
341 XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
342 xfs_iunlock(ip, XFS_ILOCK_SHARED);
343
344 whichfork = XFS_COW_FORK;
345 goto allocate_blocks;
346 }
347
348 /*
349 * No COW extent overlap. Revalidate now that we may have updated
350 * ->cow_seq. If the data mapping is still valid, we're done.
351 */
352 if (xfs_imap_valid(wpc, ip, offset)) {
353 xfs_iunlock(ip, XFS_ILOCK_SHARED);
354 return 0;
355 }
356
357 /*
358 * If we don't have a valid map, now it's time to get a new one for this
359 * offset. This will convert delayed allocations (including COW ones)
360 * into real extents.
361 */
362 if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
363 imap.br_startoff = end_fsb; /* fake a hole past EOF */
364 XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
365 xfs_iunlock(ip, XFS_ILOCK_SHARED);
366
367 /* landed in a hole or beyond EOF? */
368 if (imap.br_startoff > offset_fsb) {
369 imap.br_blockcount = imap.br_startoff - offset_fsb;
370 imap.br_startoff = offset_fsb;
371 imap.br_startblock = HOLESTARTBLOCK;
372 imap.br_state = XFS_EXT_NORM;
373 }
374
375 /*
376 * Truncate to the next COW extent if there is one. This is the only
377 * opportunity to do this because we can skip COW fork lookups for the
378 * subsequent blocks in the mapping; however, the requirement to treat
379 * the COW range separately remains.
380 */
381 if (cow_fsb != NULLFILEOFF &&
382 cow_fsb < imap.br_startoff + imap.br_blockcount)
383 imap.br_blockcount = cow_fsb - imap.br_startoff;
384
385 /* got a delalloc extent? */
386 if (imap.br_startblock != HOLESTARTBLOCK &&
387 isnullstartblock(imap.br_startblock))
388 goto allocate_blocks;
389
390 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
391 trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
392 return 0;
393 allocate_blocks:
394 /*
395 * Convert a dellalloc extent to a real one. The current page is held
396 * locked so nothing could have removed the block backing offset_fsb,
397 * although it could have moved from the COW to the data fork by another
398 * thread.
399 */
400 if (whichfork == XFS_COW_FORK)
401 seq = &XFS_WPC(wpc)->cow_seq;
402 else
403 seq = &XFS_WPC(wpc)->data_seq;
404
405 error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
406 &wpc->iomap, seq);
407 if (error) {
408 /*
409 * If we failed to find the extent in the COW fork we might have
410 * raced with a COW to data fork conversion or truncate.
411 * Restart the lookup to catch the extent in the data fork for
412 * the former case, but prevent additional retries to avoid
413 * looping forever for the latter case.
414 */
415 if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
416 goto retry;
417 ASSERT(error != -EAGAIN);
418 return error;
419 }
420
421 /*
422 * Due to merging the return real extent might be larger than the
423 * original delalloc one. Trim the return extent to the next COW
424 * boundary again to force a re-lookup.
425 */
426 if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
427 loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
428
429 if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
430 wpc->iomap.length = cow_offset - wpc->iomap.offset;
431 }
432
433 ASSERT(wpc->iomap.offset <= offset);
434 ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
435 trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
436 return 0;
437 }
438
439 static int
xfs_submit_ioend(struct iomap_writepage_ctx * wpc,int status)440 xfs_submit_ioend(
441 struct iomap_writepage_ctx *wpc,
442 int status)
443 {
444 struct iomap_ioend *ioend = wpc->ioend;
445 unsigned int nofs_flag;
446
447 /*
448 * We can allocate memory here while doing writeback on behalf of
449 * memory reclaim. To avoid memory allocation deadlocks set the
450 * task-wide nofs context for the following operations.
451 */
452 nofs_flag = memalloc_nofs_save();
453
454 /* Convert CoW extents to regular */
455 if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
456 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
457 ioend->io_offset, ioend->io_size);
458 }
459
460 memalloc_nofs_restore(nofs_flag);
461
462 /* send ioends that might require a transaction to the completion wq */
463 if (xfs_ioend_is_append(ioend) ||
464 (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
465 ioend->io_bio.bi_end_io = xfs_end_bio;
466
467 if (status)
468 return status;
469 submit_bio(&ioend->io_bio);
470 return 0;
471 }
472
473 /*
474 * If the folio has delalloc blocks on it, the caller is asking us to punch them
475 * out. If we don't, we can leave a stale delalloc mapping covered by a clean
476 * page that needs to be dirtied again before the delalloc mapping can be
477 * converted. This stale delalloc mapping can trip up a later direct I/O read
478 * operation on the same region.
479 *
480 * We prevent this by truncating away the delalloc regions on the folio. Because
481 * they are delalloc, we can do this without needing a transaction. Indeed - if
482 * we get ENOSPC errors, we have to be able to do this truncation without a
483 * transaction as there is no space left for block reservation (typically why
484 * we see a ENOSPC in writeback).
485 */
486 static void
xfs_discard_folio(struct folio * folio,loff_t pos)487 xfs_discard_folio(
488 struct folio *folio,
489 loff_t pos)
490 {
491 struct xfs_inode *ip = XFS_I(folio->mapping->host);
492 struct xfs_mount *mp = ip->i_mount;
493
494 if (xfs_is_shutdown(mp))
495 return;
496
497 xfs_alert_ratelimited(mp,
498 "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
499 folio, ip->i_ino, pos);
500
501 /*
502 * The end of the punch range is always the offset of the first
503 * byte of the next folio. Hence the end offset is only dependent on the
504 * folio itself and not the start offset that is passed in.
505 */
506 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
507 folio_pos(folio) + folio_size(folio), NULL);
508 }
509
510 static const struct iomap_writeback_ops xfs_writeback_ops = {
511 .map_blocks = xfs_map_blocks,
512 .submit_ioend = xfs_submit_ioend,
513 .discard_folio = xfs_discard_folio,
514 };
515
516 struct xfs_zoned_writepage_ctx {
517 struct iomap_writepage_ctx ctx;
518 struct xfs_open_zone *open_zone;
519 };
520
521 static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx * ctx)522 XFS_ZWPC(struct iomap_writepage_ctx *ctx)
523 {
524 return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
525 }
526
527 static int
xfs_zoned_map_blocks(struct iomap_writepage_ctx * wpc,struct inode * inode,loff_t offset,unsigned int len)528 xfs_zoned_map_blocks(
529 struct iomap_writepage_ctx *wpc,
530 struct inode *inode,
531 loff_t offset,
532 unsigned int len)
533 {
534 struct xfs_inode *ip = XFS_I(inode);
535 struct xfs_mount *mp = ip->i_mount;
536 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
537 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
538 xfs_filblks_t count_fsb;
539 struct xfs_bmbt_irec imap, del;
540 struct xfs_iext_cursor icur;
541
542 if (xfs_is_shutdown(mp))
543 return -EIO;
544
545 XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
546
547 /*
548 * All dirty data must be covered by delalloc extents. But truncate can
549 * remove delalloc extents underneath us or reduce their size.
550 * Returning a hole tells iomap to not write back any data from this
551 * range, which is the right thing to do in that case.
552 *
553 * Otherwise just tell iomap to treat ranges previously covered by a
554 * delalloc extent as mapped. The actual block allocation will be done
555 * just before submitting the bio.
556 *
557 * This implies we never map outside folios that are locked or marked
558 * as under writeback, and thus there is no need check the fork sequence
559 * count here.
560 */
561 xfs_ilock(ip, XFS_ILOCK_EXCL);
562 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
563 imap.br_startoff = end_fsb; /* fake a hole past EOF */
564 if (imap.br_startoff > offset_fsb) {
565 imap.br_blockcount = imap.br_startoff - offset_fsb;
566 imap.br_startoff = offset_fsb;
567 imap.br_startblock = HOLESTARTBLOCK;
568 imap.br_state = XFS_EXT_NORM;
569 xfs_iunlock(ip, XFS_ILOCK_EXCL);
570 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
571 return 0;
572 }
573 end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
574 count_fsb = end_fsb - offset_fsb;
575
576 del = imap;
577 xfs_trim_extent(&del, offset_fsb, count_fsb);
578 xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
579 XFS_BMAPI_REMAP);
580 xfs_iunlock(ip, XFS_ILOCK_EXCL);
581
582 wpc->iomap.type = IOMAP_MAPPED;
583 wpc->iomap.flags = IOMAP_F_DIRTY;
584 wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
585 wpc->iomap.offset = offset;
586 wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
587 wpc->iomap.flags = IOMAP_F_ANON_WRITE;
588
589 trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
590 return 0;
591 }
592
593 static int
xfs_zoned_submit_ioend(struct iomap_writepage_ctx * wpc,int status)594 xfs_zoned_submit_ioend(
595 struct iomap_writepage_ctx *wpc,
596 int status)
597 {
598 wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
599 if (status)
600 return status;
601 xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
602 return 0;
603 }
604
605 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
606 .map_blocks = xfs_zoned_map_blocks,
607 .submit_ioend = xfs_zoned_submit_ioend,
608 .discard_folio = xfs_discard_folio,
609 };
610
611 STATIC int
xfs_vm_writepages(struct address_space * mapping,struct writeback_control * wbc)612 xfs_vm_writepages(
613 struct address_space *mapping,
614 struct writeback_control *wbc)
615 {
616 struct xfs_inode *ip = XFS_I(mapping->host);
617
618 xfs_iflags_clear(ip, XFS_ITRUNCATED);
619
620 if (xfs_is_zoned_inode(ip)) {
621 struct xfs_zoned_writepage_ctx xc = { };
622 int error;
623
624 error = iomap_writepages(mapping, wbc, &xc.ctx,
625 &xfs_zoned_writeback_ops);
626 if (xc.open_zone)
627 xfs_open_zone_put(xc.open_zone);
628 return error;
629 } else {
630 struct xfs_writepage_ctx wpc = { };
631
632 return iomap_writepages(mapping, wbc, &wpc.ctx,
633 &xfs_writeback_ops);
634 }
635 }
636
637 STATIC int
xfs_dax_writepages(struct address_space * mapping,struct writeback_control * wbc)638 xfs_dax_writepages(
639 struct address_space *mapping,
640 struct writeback_control *wbc)
641 {
642 struct xfs_inode *ip = XFS_I(mapping->host);
643
644 xfs_iflags_clear(ip, XFS_ITRUNCATED);
645 return dax_writeback_mapping_range(mapping,
646 xfs_inode_buftarg(ip)->bt_daxdev, wbc);
647 }
648
649 STATIC sector_t
xfs_vm_bmap(struct address_space * mapping,sector_t block)650 xfs_vm_bmap(
651 struct address_space *mapping,
652 sector_t block)
653 {
654 struct xfs_inode *ip = XFS_I(mapping->host);
655
656 trace_xfs_vm_bmap(ip);
657
658 /*
659 * The swap code (ab-)uses ->bmap to get a block mapping and then
660 * bypasses the file system for actual I/O. We really can't allow
661 * that on reflinks inodes, so we have to skip out here. And yes,
662 * 0 is the magic code for a bmap error.
663 *
664 * Since we don't pass back blockdev info, we can't return bmap
665 * information for rt files either.
666 */
667 if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
668 return 0;
669 return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
670 }
671
672 STATIC int
xfs_vm_read_folio(struct file * unused,struct folio * folio)673 xfs_vm_read_folio(
674 struct file *unused,
675 struct folio *folio)
676 {
677 return iomap_read_folio(folio, &xfs_read_iomap_ops);
678 }
679
680 STATIC void
xfs_vm_readahead(struct readahead_control * rac)681 xfs_vm_readahead(
682 struct readahead_control *rac)
683 {
684 iomap_readahead(rac, &xfs_read_iomap_ops);
685 }
686
687 static int
xfs_vm_swap_activate(struct swap_info_struct * sis,struct file * swap_file,sector_t * span)688 xfs_vm_swap_activate(
689 struct swap_info_struct *sis,
690 struct file *swap_file,
691 sector_t *span)
692 {
693 struct xfs_inode *ip = XFS_I(file_inode(swap_file));
694
695 /*
696 * Swap file activation can race against concurrent shared extent
697 * removal in files that have been cloned. If this happens,
698 * iomap_swapfile_iter() can fail because it encountered a shared
699 * extent even though an operation is in progress to remove those
700 * shared extents.
701 *
702 * This race becomes problematic when we defer extent removal
703 * operations beyond the end of a syscall (i.e. use async background
704 * processing algorithms). Users think the extents are no longer
705 * shared, but iomap_swapfile_iter() still sees them as shared
706 * because the refcountbt entries for the extents being removed have
707 * not yet been updated. Hence the swapon call fails unexpectedly.
708 *
709 * The race condition is currently most obvious from the unlink()
710 * operation as extent removal is deferred until after the last
711 * reference to the inode goes away. We then process the extent
712 * removal asynchronously, hence triggers the "syscall completed but
713 * work not done" condition mentioned above. To close this race
714 * window, we need to flush any pending inodegc operations to ensure
715 * they have updated the refcountbt records before we try to map the
716 * swapfile.
717 */
718 xfs_inodegc_flush(ip->i_mount);
719
720 /*
721 * Direct the swap code to the correct block device when this file
722 * sits on the RT device.
723 */
724 sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
725
726 return iomap_swapfile_activate(sis, swap_file, span,
727 &xfs_read_iomap_ops);
728 }
729
730 const struct address_space_operations xfs_address_space_operations = {
731 .read_folio = xfs_vm_read_folio,
732 .readahead = xfs_vm_readahead,
733 .writepages = xfs_vm_writepages,
734 .dirty_folio = iomap_dirty_folio,
735 .release_folio = iomap_release_folio,
736 .invalidate_folio = iomap_invalidate_folio,
737 .bmap = xfs_vm_bmap,
738 .migrate_folio = filemap_migrate_folio,
739 .is_partially_uptodate = iomap_is_partially_uptodate,
740 .error_remove_folio = generic_error_remove_folio,
741 .swap_activate = xfs_vm_swap_activate,
742 };
743
744 const struct address_space_operations xfs_dax_aops = {
745 .writepages = xfs_dax_writepages,
746 .dirty_folio = noop_dirty_folio,
747 .swap_activate = xfs_vm_swap_activate,
748 };
749