// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"

struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
        unsigned int            data_seq;
        unsigned int            cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
        return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
        return ioend->io_offset + ioend->io_size >
                XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        size_t                  size)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        xfs_fsize_t             isize;
        int                     error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp);
                return 0;
        }

        trace_xfs_setfilesize(ip, offset, size);

        ip->i_disk_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        return xfs_trans_commit(tp);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
        struct iomap_ioend      *ioend)
{
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
        unsigned int            nofs_flag;
        int                     error;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim. To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /*
         * Just clean up the in-memory structures if the fs has been shut down.
         */
        if (xfs_is_shutdown(mp)) {
                error = -EIO;
                goto done;
        }

        /*
         * Clean up all COW blocks and underlying data fork delalloc blocks on
         * I/O error. The delalloc punch is required because this ioend was
         * mapped to blocks in the COW fork and the associated pages are no
         * longer dirty. If we don't remove delalloc blocks here, they become
         * stale and can corrupt free space accounting on unmount.
         */
        error = blk_status_to_errno(ioend->io_bio.bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_IOEND_SHARED) {
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
                                        offset + size);
                }
                goto done;
        }

        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
        if (ioend->io_flags & IOMAP_IOEND_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);

        if (!error && xfs_ioend_is_append(ioend))
                error = xfs_setfilesize(ip, offset, size);
done:
        iomap_finish_ioends(ioend, error);
        memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion. Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we can be processing here is largely unbound in length, and
 * we may have to perform significant amounts of work on each ioend to complete
 * it. Hence we have to be careful about holding the CPU for too long in this
 * loop.
 */
void
xfs_end_io(
        struct work_struct      *work)
{
        struct xfs_inode        *ip =
                container_of(work, struct xfs_inode, i_ioend_work);
        struct iomap_ioend      *ioend;
        struct list_head        tmp;
        unsigned long           flags;

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        list_replace_init(&ip->i_ioend_list, &tmp);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

        iomap_sort_ioends(&tmp);
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
                iomap_ioend_try_merge(ioend, &tmp);
                xfs_end_ioend(ioend);
                cond_resched();
        }
}

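/*
 * Bio completion handler: defer ioend completion to process context by adding
 * the ioend to the per-inode completion list and, if the list was previously
 * empty, scheduling the inode's completion work on the unwritten extent
 * conversion workqueue.
 */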
STATIC void
xfs_end_bio(
        struct bio              *bio)
{
        struct iomap_ioend      *ioend = iomap_ioend_from_bio(bio);
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        unsigned long           flags;

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
                WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
                                &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
        struct iomap_writepage_ctx      *wpc,
        struct xfs_inode                *ip,
        loff_t                          offset)
{
        if (offset < wpc->iomap.offset ||
            offset >= wpc->iomap.offset + wpc->iomap.length)
                return false;
        /*
         * If this is a COW mapping, it is sufficient to check that the mapping
         * covers the offset. Be careful to check this first because the caller
         * can revalidate a COW mapping without updating the data seqno.
         */
        if (wpc->iomap.flags & IOMAP_F_SHARED)
                return true;

        /*
         * This is not a COW mapping. Check the sequence number of the data
         * fork because concurrent changes could have invalidated the extent.
         * Check the COW fork because concurrent changes since the last time we
         * checked (and found nothing at this offset) could have added
         * overlapping blocks.
         */
        if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
                trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
                                XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
                return false;
        }
        if (xfs_inode_has_cow_data(ip) &&
            XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
                trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
                                XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
                return false;
        }
        return true;
}

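/*
 * Provide the writeback mapping for the current offset. Reuse the cached
 * mapping in wpc if it is still valid; otherwise look up the COW fork first
 * (COW I/O takes precedence), then the data fork, and convert any delalloc
 * extent found into a real allocation before filling in wpc->iomap.
 */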
static int
xfs_map_blocks(
        struct iomap_writepage_ctx *wpc,
        struct inode            *inode,
        loff_t                  offset,
        unsigned int            len)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb;
        int                     whichfork;
        struct xfs_bmbt_irec    imap;
        struct xfs_iext_cursor  icur;
        int                     retries = 0;
        int                     error = 0;
        unsigned int            *seq;

        if (xfs_is_shutdown(mp))
                return -EIO;

        XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared. COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one, or the COW fork hasn't changed from the last time we looked
         * at it.
         *
         * It is safe to check the COW fork if_seq here without the ILOCK
         * because we are indirectly protected against concurrent updates:
         * writeback has the page locked, which prevents concurrent
         * invalidations by reflink and directio and prevents concurrent
         * buffered writes to the same page. Changes to if_seq always happen
         * under i_lock, which protects against concurrent updates and provides
         * a memory barrier on the way out that ensures that we always see the
         * current value.
         */
        if (xfs_imap_valid(wpc, ip, offset))
                return 0;

        /*
         * If we don't have a valid map, now it's time to get a new one for
         * this offset. This will convert delayed allocations (including COW
         * ones) into real extents. If we return without a valid map, it means
         * we landed in a hole and we skip the block.
         */
retry:
        cow_fsb = NULLFILEOFF;
        whichfork = XFS_DATA_FORK;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(!xfs_need_iread_extents(&ip->i_df));

        /*
         * Check if this offset is covered by a COW extent, and if so use it
         * directly instead of looking up anything in the data fork.
         */
        if (xfs_inode_has_cow_data(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                cow_fsb = imap.br_startoff;
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);

                whichfork = XFS_COW_FORK;
                goto allocate_blocks;
        }

        /*
         * No COW extent overlap. Revalidate now that we may have updated
         * ->cow_seq. If the data mapping is still valid, we're done.
         */
        if (xfs_imap_valid(wpc, ip, offset)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for
         * this offset. This will convert delayed allocations (including COW
         * ones) into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                imap.br_state = XFS_EXT_NORM;
        }

        /*
         * Truncate to the next COW extent if there is one. This is the only
         * opportunity to do this because we can skip COW fork lookups for the
         * subsequent blocks in the mapping; however, the requirement to treat
         * the COW range separately remains.
         */
        if (cow_fsb != NULLFILEOFF &&
            cow_fsb < imap.br_startoff + imap.br_blockcount)
                imap.br_blockcount = cow_fsb - imap.br_startoff;

        /* got a delalloc extent? */
        if (imap.br_startblock != HOLESTARTBLOCK &&
            isnullstartblock(imap.br_startblock))
                goto allocate_blocks;

        xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
        trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
        return 0;
allocate_blocks:
        /*
         * Convert a delalloc extent to a real one. The current page is held
         * locked so nothing could have removed the block backing offset_fsb,
         * although it could have been moved from the COW to the data fork by
         * another thread.
         */
        if (whichfork == XFS_COW_FORK)
                seq = &XFS_WPC(wpc)->cow_seq;
        else
                seq = &XFS_WPC(wpc)->data_seq;

        error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
                        &wpc->iomap, seq);
        if (error) {
                /*
                 * If we failed to find the extent in the COW fork we might
                 * have raced with a COW to data fork conversion or truncate.
                 * Restart the lookup to catch the extent in the data fork for
                 * the former case, but prevent additional retries to avoid
                 * looping forever for the latter case.
                 */
                if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
                        goto retry;
                ASSERT(error != -EAGAIN);
                return error;
        }

        /*
         * Due to merging, the real extent returned might be larger than the
         * original delalloc one. Trim the returned extent to the next COW
         * boundary again to force a re-lookup.
         */
        if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
                loff_t          cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

                if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
                        wpc->iomap.length = cow_offset - wpc->iomap.offset;
        }

        ASSERT(wpc->iomap.offset <= offset);
        ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
        trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
        return 0;
}

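/*
 * Submit the I/O for an ioend. Pending COW extents are converted before
 * submission, and any ioend whose completion may require a transaction (file
 * size update, unwritten extent conversion or COW remapping) gets a bio
 * end_io handler that defers completion to the workqueue.
 */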
static int
xfs_submit_ioend(
        struct iomap_writepage_ctx *wpc,
        int                     status)
{
        struct iomap_ioend      *ioend = wpc->ioend;
        unsigned int            nofs_flag;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim. To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /* Convert CoW extents to regular */
        if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                ioend->io_offset, ioend->io_size);
        }

        memalloc_nofs_restore(nofs_flag);

        /* send ioends that might require a transaction to the completion wq */
        if (xfs_ioend_is_append(ioend) ||
            (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
                ioend->io_bio.bi_end_io = xfs_end_bio;

        if (status)
                return status;
        submit_bio(&ioend->io_bio);
        return 0;
}

/*
 * If the folio has delalloc blocks on it, the caller is asking us to punch
 * them out. If we don't, we can leave a stale delalloc mapping covered by a
 * clean page that needs to be dirtied again before the delalloc mapping can be
 * converted. This stale delalloc mapping can trip up a later direct I/O read
 * operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the folio.
 * Because they are delalloc, we can do this without needing a transaction.
 * Indeed, if we get ENOSPC errors, we have to be able to do this truncation
 * without a transaction as there is no space left for block reservation
 * (typically why we see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
        struct folio            *folio,
        loff_t                  pos)
{
        struct xfs_inode        *ip = XFS_I(folio->mapping->host);
        struct xfs_mount        *mp = ip->i_mount;

        if (xfs_is_shutdown(mp))
                return;

        xfs_alert_ratelimited(mp,
                "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
                        folio, ip->i_ino, pos);

        /*
         * The end of the punch range is always the offset of the first byte of
         * the next folio. Hence the end offset is only dependent on the folio
         * itself and not the start offset that is passed in.
         */
        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
                                folio_pos(folio) + folio_size(folio));
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
        .map_blocks             = xfs_map_blocks,
        .submit_ioend           = xfs_submit_ioend,
        .discard_folio          = xfs_discard_folio,
};

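/*
 * Writeback entry point for buffered I/O: set up a writeback context to cache
 * the data and COW fork sequence numbers, then hand the actual work to the
 * generic iomap writeback code.
 */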
STATIC int
xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = { };

        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
        return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}

STATIC int
xfs_dax_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        xfs_iflags_clear(ip, XFS_ITRUNCATED);
        return dax_writeback_mapping_range(mapping,
                        xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
        struct address_space    *mapping,
        sector_t                block)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        trace_xfs_vm_bmap(ip);

        /*
         * The swap code (ab-)uses ->bmap to get a block mapping and then
         * bypasses the file system for actual I/O. We really can't allow
         * that on reflink inodes, so we have to skip out here. And yes,
         * 0 is the magic code for a bmap error.
         *
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
        if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

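/* Buffered read paths are handled entirely by the generic iomap code. */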
STATIC int
xfs_vm_read_folio(
        struct file             *unused,
        struct folio            *folio)
{
        return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
        struct readahead_control        *rac)
{
        iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_vm_swap_activate(
        struct swap_info_struct         *sis,
        struct file                     *swap_file,
        sector_t                        *span)
{
        struct xfs_inode                *ip = XFS_I(file_inode(swap_file));

        /*
         * Swap file activation can race against concurrent shared extent
         * removal in files that have been cloned. If this happens,
         * iomap_swapfile_iter() can fail because it encounters a shared
         * extent even though an operation is in progress to remove those
         * shared extents.
         *
         * This race becomes problematic when we defer extent removal
         * operations beyond the end of a syscall (i.e. use async background
         * processing algorithms). Users think the extents are no longer
         * shared, but iomap_swapfile_iter() still sees them as shared
         * because the refcountbt entries for the extents being removed have
         * not yet been updated. Hence the swapon call fails unexpectedly.
         *
         * The race condition is currently most obvious from the unlink()
         * operation as extent removal is deferred until after the last
         * reference to the inode goes away. We then process the extent
         * removal asynchronously, hence triggering the "syscall completed but
         * work not done" condition mentioned above. To close this race
         * window, we need to flush any pending inodegc operations to ensure
         * they have updated the refcountbt records before we try to map the
         * swapfile.
         */
        xfs_inodegc_flush(ip->i_mount);

        /*
         * Direct the swap code to the correct block device when this file
         * sits on the RT device.
         */
        sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;

        return iomap_swapfile_activate(sis, swap_file, span,
                        &xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
        .read_folio             = xfs_vm_read_folio,
        .readahead              = xfs_vm_readahead,
        .writepages             = xfs_vm_writepages,
        .dirty_folio            = iomap_dirty_folio,
        .release_folio          = iomap_release_folio,
        .invalidate_folio       = iomap_invalidate_folio,
        .bmap                   = xfs_vm_bmap,
        .migrate_folio          = filemap_migrate_folio,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_folio     = generic_error_remove_folio,
        .swap_activate          = xfs_vm_swap_activate,
};

const struct address_space_operations xfs_dax_aops = {
        .writepages             = xfs_dax_writepages,
        .dirty_folio            = noop_dirty_folio,
        .swap_activate          = xfs_vm_swap_activate,
};