// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2025 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"

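/*
 * Per-writeback mapping context.  The cached data and COW fork sequence
 * numbers are used to detect extent map changes and decide whether the
 * mapping cached in ->ctx.iomap is still valid for the next block.
 */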
struct xfs_writepage_ctx {
	struct iomap_writepage_ctx ctx;
	unsigned int		data_seq;
	unsigned int		cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_fsize_t		isize;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_disk_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

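/*
 * Drop the open zone references held by an ioend and by any ioends that have
 * been merged into it.
 */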
static void
xfs_ioend_put_open_zones(
	struct iomap_ioend	*ioend)
{
	struct iomap_ioend	*tmp;

	/*
	 * Put the open zone for all ioends merged into this one (if any).
	 */
	list_for_each_entry(tmp, &ioend->io_list, io_list)
		xfs_open_zone_put(tmp->io_private);

	/*
	 * The main ioend might not have an open zone if the submission failed
	 * before xfs_zone_alloc_and_submit got called.
	 */
	if (ioend->io_private)
		xfs_open_zone_put(ioend->io_private);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
	struct iomap_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	bool			is_zoned = xfs_is_zoned_inode(ip);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	unsigned int		nofs_flag;
	int			error;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (xfs_is_shutdown(mp)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up all COW blocks and underlying data fork delalloc blocks on
	 * I/O error. The delalloc punch is required because this ioend was
	 * mapped to blocks in the COW fork and the associated pages are no
	 * longer dirty. If we don't remove delalloc blocks here, they become
	 * stale and can corrupt free space accounting on unmount.
	 */
	error = blk_status_to_errno(ioend->io_bio.bi_status);
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
			ASSERT(!is_zoned);
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
					offset + size, NULL);
		}
		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	if (is_zoned)
		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
				ioend->io_private, NULLFSBLOCK);
	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);

	if (!error &&
	    !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
	    xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, offset, size);
done:
	if (is_zoned)
		xfs_ioend_put_open_zones(ioend);
	iomap_finish_ioends(ioend, error);
	memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion. Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we process here can be essentially unbounded in length, and
 * we may have to perform a significant amount of work on each ioend to
 * complete it. Hence we have to be careful about holding the CPU for too long
 * in this loop.
 */
void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_inode	*ip =
		container_of(work, struct xfs_inode, i_ioend_work);
	struct iomap_ioend	*ioend;
	struct list_head	tmp;
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	list_replace_init(&ip->i_ioend_list, &tmp);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

	iomap_sort_ioends(&tmp);
	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
			io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, &tmp);
		xfs_end_ioend(ioend);
		cond_resched();
	}
}

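/*
 * Bio completion handler: record the written sector for zone append writes
 * and defer the remaining completion work to the per-inode ioend list, which
 * is processed from the workqueue by xfs_end_io().
 */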
void
xfs_end_bio(
	struct bio		*bio)
{
	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned long		flags;

	/*
	 * For zone append writes, record the sector that was actually written
	 * and set the boundary flag if needed.
	 */
	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
		ioend->io_sector = bio->bi_iter.bi_sector;
		xfs_mark_rtg_boundary(ioend);
	}

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
				&ip->i_ioend_work));
	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * We cannot cancel the ioend directly on error. We may have already set other
 * pages under writeback and hence we have to run I/O completion to mark the
 * error state of the pages under writeback appropriately.
 *
 * If the folio has delalloc blocks on it, the caller is asking us to punch them
 * out. If we don't, we can leave a stale delalloc mapping covered by a clean
 * page that needs to be dirtied again before the delalloc mapping can be
 * converted. This stale delalloc mapping can trip up a later direct I/O read
 * operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the folio. Because
 * they are delalloc, we can do this without needing a transaction. Indeed, if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why
 * we see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
	struct folio		*folio,
	loff_t			pos)
{
	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
	struct xfs_mount	*mp = ip->i_mount;

	if (xfs_is_shutdown(mp))
		return;

	xfs_alert_ratelimited(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
			folio, ip->i_ino, pos);

	/*
	 * The end of the punch range is always the offset of the first
	 * byte of the next folio. Hence the end offset is only dependent on the
	 * folio itself and not the start offset that is passed in.
	 */
	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
			folio_pos(folio) + folio_size(folio), NULL);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
	struct iomap_writepage_ctx *wpc,
	struct xfs_inode	*ip,
	loff_t			offset)
{
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length)
		return false;
	/*
	 * If this is a COW mapping, it is sufficient to check that the mapping
	 * covers the offset. Be careful to check this first because the caller
	 * can revalidate a COW mapping without updating the data seqno.
	 */
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		return true;

	/*
	 * This is not a COW mapping. Check the sequence number of the data fork
	 * because concurrent changes could have invalidated the extent. Check
	 * the COW fork because concurrent changes since the last time we
	 * checked (and found nothing at this offset) could have added
	 * overlapping blocks.
	 */
	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
		return false;
	}
	if (xfs_inode_has_cow_data(ip) &&
	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
		return false;
	}
	return true;
}

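/*
 * Look up or allocate the block mapping backing the range to be written and
 * cache it in wpc->iomap so that it can be reused for subsequent blocks of
 * the same writeback iteration.
 */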
static int
xfs_map_blocks(
	struct iomap_writepage_ctx *wpc,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(wpc->inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(wpc->inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_fileoff_t		cow_fsb;
	int			whichfork;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;
	int			retries = 0;
	int			error = 0;
	unsigned int		*seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared. COW I/O always takes precedence, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK because
	 * we've indirectly protected against concurrent updates: writeback has
	 * the page locked, which prevents concurrent invalidations by reflink
	 * and directio and prevents concurrent buffered writes to the same
	 * page. Changes to if_seq always happen under i_lock, which protects
	 * against concurrent updates and provides a memory barrier on the way
	 * out that ensures that we always see the current value.
	 */
	if (xfs_imap_valid(wpc, ip, offset))
		return 0;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents. If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
retry:
	cow_fsb = NULLFILEOFF;
	whichfork = XFS_DATA_FORK;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(!xfs_need_iread_extents(&ip->i_df));

	/*
	 * Check if this offset is covered by a COW extent, and if so use it
	 * directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		whichfork = XFS_COW_FORK;
		goto allocate_blocks;
	}

	/*
	 * No COW extent overlap. Revalidate now that we may have updated
	 * ->cow_seq. If the data mapping is still valid, we're done.
	 */
	if (xfs_imap_valid(wpc, ip, offset)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/* landed in a hole or beyond EOF? */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
	}

	/*
	 * Truncate to the next COW extent if there is one. This is the only
	 * opportunity to do this because we can skip COW fork lookups for the
	 * subsequent blocks in the mapping; however, the requirement to treat
	 * the COW range separately remains.
	 */
	if (cow_fsb != NULLFILEOFF &&
	    cow_fsb < imap.br_startoff + imap.br_blockcount)
		imap.br_blockcount = cow_fsb - imap.br_startoff;

	/* got a delalloc extent? */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    isnullstartblock(imap.br_startblock))
		goto allocate_blocks;

	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
	return 0;
allocate_blocks:
	/*
	 * Convert a delalloc extent to a real one. The current page is held
	 * locked so nothing could have removed the block backing offset_fsb,
	 * although it could have moved from the COW to the data fork by another
	 * thread.
	 */
	if (whichfork == XFS_COW_FORK)
		seq = &XFS_WPC(wpc)->cow_seq;
	else
		seq = &XFS_WPC(wpc)->data_seq;

	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
			&wpc->iomap, seq);
	if (error) {
		/*
		 * If we failed to find the extent in the COW fork we might have
		 * raced with a COW to data fork conversion or truncate.
		 * Restart the lookup to catch the extent in the data fork for
		 * the former case, but prevent additional retries to avoid
		 * looping forever for the latter case.
		 */
		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
			goto retry;
		ASSERT(error != -EAGAIN);
		return error;
	}

	/*
	 * Due to merging the returned real extent might be larger than the
	 * original delalloc one. Trim the returned extent to the next COW
	 * boundary again to force a re-lookup.
	 */
	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
			wpc->iomap.length = cow_offset - wpc->iomap.offset;
	}

	ASSERT(wpc->iomap.offset <= offset);
	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
	return 0;
}

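/*
 * ->writeback_range callback: map the blocks backing this folio range and add
 * it to the current ioend, discarding the folio's delalloc blocks on failure.
 */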
static ssize_t
xfs_writeback_range(
	struct iomap_writepage_ctx *wpc,
	struct folio		*folio,
	u64			offset,
	unsigned int		len,
	u64			end_pos)
{
	ssize_t			ret;

	ret = xfs_map_blocks(wpc, offset, len);
	if (!ret)
		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
	if (ret < 0)
		xfs_discard_folio(folio, offset);
	return ret;
}

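/*
 * Return true if completing this ioend requires work that cannot be done from
 * bio completion (irq) context and must be deferred to the completion
 * workqueue instead.
 */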
static bool
xfs_ioend_needs_wq_completion(
	struct iomap_ioend	*ioend)
{
	/* Changing inode size requires a transaction. */
	if (xfs_ioend_is_append(ioend))
		return true;

	/* Extent manipulation requires a transaction. */
	if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
		return true;

	/* Page cache invalidation cannot be done in irq context. */
	if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
		return true;

	return false;
}

static int
xfs_writeback_submit(
	struct iomap_writepage_ctx	*wpc,
	int				error)
{
	struct iomap_ioend		*ioend = wpc->wb_ctx;

	/*
	 * Convert CoW extents to regular.
	 *
	 * We can allocate memory here while doing writeback on behalf of memory
	 * reclaim. To avoid memory allocation deadlocks, set the task-wide
	 * nofs context.
	 */
	if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
		unsigned int		nofs_flag;

		nofs_flag = memalloc_nofs_save();
		error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/*
	 * Send ioends that might require a transaction to the completion wq.
	 */
	if (xfs_ioend_needs_wq_completion(ioend))
		ioend->io_bio.bi_end_io = xfs_end_bio;

	return iomap_ioend_writeback_submit(wpc, error);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
	.writeback_range	= xfs_writeback_range,
	.writeback_submit	= xfs_writeback_submit,
};

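/*
 * Per-writeback context for zoned inodes.  Block allocation happens at submit
 * time; the open zone used for allocation is cached here and released once
 * writeback completes.
 */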
struct xfs_zoned_writepage_ctx {
	struct iomap_writepage_ctx	ctx;
	struct xfs_open_zone		*open_zone;
};

static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}

static int
xfs_zoned_map_blocks(
	struct iomap_writepage_ctx *wpc,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(wpc->inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_filblks_t		count_fsb;
	struct xfs_bmbt_irec	imap, del;
	struct xfs_iext_cursor	icur;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * All dirty data must be covered by delalloc extents. But truncate can
	 * remove delalloc extents underneath us or reduce their size.
	 * Returning a hole tells iomap to not write back any data from this
	 * range, which is the right thing to do in that case.
	 *
	 * Otherwise just tell iomap to treat ranges previously covered by a
	 * delalloc extent as mapped. The actual block allocation will be done
	 * just before submitting the bio.
	 *
	 * This implies we never map outside folios that are locked or marked
	 * as under writeback, and thus there is no need to check the fork
	 * sequence count here.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
		return 0;
	}
	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
	count_fsb = end_fsb - offset_fsb;

	del = imap;
	xfs_trim_extent(&del, offset_fsb, count_fsb);
	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
			XFS_BMAPI_REMAP);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	wpc->iomap.type = IOMAP_MAPPED;
	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
	wpc->iomap.offset = offset;
	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
	wpc->iomap.flags = IOMAP_F_ANON_WRITE;

	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
	return 0;
}

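/*
 * ->writeback_range callback for zoned inodes: map the delalloc blocks
 * backing this folio range and add it to the current ioend, discarding the
 * folio's delalloc blocks on failure.
 */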
static ssize_t
xfs_zoned_writeback_range(
	struct iomap_writepage_ctx *wpc,
	struct folio		*folio,
	u64			offset,
	unsigned int		len,
	u64			end_pos)
{
	ssize_t			ret;

	ret = xfs_zoned_map_blocks(wpc, offset, len);
	if (!ret)
		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
	if (ret < 0)
		xfs_discard_folio(folio, offset);
	return ret;
}

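/*
 * Submit a zoned writeback ioend: on error complete the bio with that error,
 * otherwise hand the ioend to the zone allocator, which picks an open zone
 * and issues the I/O.
 */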
static int
xfs_zoned_writeback_submit(
	struct iomap_writepage_ctx	*wpc,
	int				error)
{
	struct iomap_ioend		*ioend = wpc->wb_ctx;

	ioend->io_bio.bi_end_io = xfs_end_bio;
	if (error) {
		ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&ioend->io_bio);
		return error;
	}
	xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
	return 0;
}

static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
	.writeback_range	= xfs_zoned_writeback_range,
	.writeback_submit	= xfs_zoned_writeback_submit,
};

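/*
 * ->writepages entry point: pick the zoned or regular writeback context for
 * this inode and run the iomap writeback machinery on it.
 */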
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	if (xfs_is_zoned_inode(ip)) {
		struct xfs_zoned_writepage_ctx	xc = {
			.ctx = {
				.inode	= mapping->host,
				.wbc	= wbc,
				.ops	= &xfs_zoned_writeback_ops
			},
		};
		int			error;

		error = iomap_writepages(&xc.ctx);
		if (xc.open_zone)
			xfs_open_zone_put(xc.open_zone);
		return error;
	} else {
		struct xfs_writepage_ctx	wpc = {
			.ctx = {
				.inode	= mapping->host,
				.wbc	= wbc,
				.ops	= &xfs_writeback_ops
			},
		};

		return iomap_writepages(&wpc.ctx);
	}
}

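/*
 * ->writepages for DAX inodes: no ioends are needed, just hand the range off
 * to the DAX code to flush dirty mappings to persistent storage.
 */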
STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O. We really can't allow
	 * that on reflink inodes, so we have to skip out here. And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

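/*
 * The buffered read paths (->read_folio and ->readahead) simply forward to
 * the iomap read code using the shared xfs_read_iomap_ops.
 */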
STATIC int
xfs_vm_read_folio(
	struct file		*unused,
	struct folio		*folio)
{
	return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
	struct readahead_control	*rac)
{
	iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_vm_swap_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	struct xfs_inode		*ip = XFS_I(file_inode(swap_file));

	if (xfs_is_zoned_inode(ip))
		return -EINVAL;

	/*
	 * Swap file activation can race against concurrent shared extent
	 * removal in files that have been cloned. If this happens,
	 * iomap_swapfile_iter() can fail because it encountered a shared
	 * extent even though an operation is in progress to remove those
	 * shared extents.
	 *
	 * This race becomes problematic when we defer extent removal
	 * operations beyond the end of a syscall (i.e. use async background
	 * processing algorithms). Users think the extents are no longer
	 * shared, but iomap_swapfile_iter() still sees them as shared
	 * because the refcountbt entries for the extents being removed have
	 * not yet been updated. Hence the swapon call fails unexpectedly.
	 *
	 * The race condition is currently most obvious from the unlink()
	 * operation as extent removal is deferred until after the last
	 * reference to the inode goes away. We then process the extent
	 * removal asynchronously, hence triggering the "syscall completed but
	 * work not done" condition mentioned above. To close this race
	 * window, we need to flush any pending inodegc operations to ensure
	 * they have updated the refcountbt records before we try to map the
	 * swapfile.
	 */
	xfs_inodegc_flush(ip->i_mount);

	/*
	 * Direct the swap code to the correct block device when this file
	 * sits on the RT device.
	 */
	sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;

	return iomap_swapfile_activate(sis, swap_file, span,
			&xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.read_folio		= xfs_vm_read_folio,
	.readahead		= xfs_vm_readahead,
	.writepages		= xfs_vm_writepages,
	.dirty_folio		= iomap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.bmap			= xfs_vm_bmap,
	.migrate_folio		= filemap_migrate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= xfs_vm_swap_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.dirty_folio		= noop_dirty_folio,
	.swap_activate		= xfs_vm_swap_activate,
};