// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"

struct xfs_writepage_ctx {
	struct iomap_writepage_ctx ctx;
	unsigned int		data_seq;
	unsigned int		cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_fsize_t		isize;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_disk_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
	struct iomap_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	unsigned int		nofs_flag;
	int			error;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (xfs_is_shutdown(mp)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up all COW blocks and underlying data fork delalloc blocks on
	 * I/O error. The delalloc punch is required because this ioend was
	 * mapped to blocks in the COW fork and the associated pages are no
	 * longer dirty. If we don't remove delalloc blocks here, they become
	 * stale and can corrupt free space accounting on unmount.
	 */
	error = blk_status_to_errno(ioend->io_bio.bi_status);
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_F_SHARED) {
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
					offset + size);
		}
		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	if (ioend->io_flags & IOMAP_F_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
	else if (ioend->io_type == IOMAP_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);

	if (!error && xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, offset, size);
done:
	iomap_finish_ioends(ioend, error);
	memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion. Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we can be processing here is largely unbounded in length and
 * we may have to perform significant amounts of work on each ioend to complete
 * it. Hence we have to be careful about holding the CPU for too long in this
 * loop.
 */
void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_inode	*ip =
		container_of(work, struct xfs_inode, i_ioend_work);
	struct iomap_ioend	*ioend;
	struct list_head	tmp;
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	list_replace_init(&ip->i_ioend_list, &tmp);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

	iomap_sort_ioends(&tmp);
	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
			io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, &tmp);
		xfs_end_ioend(ioend);
		cond_resched();
	}
}

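/*
 * Bio completion handler for ioends that need transactional completion
 * processing. All we do here is queue the ioend on the per-inode completion
 * list and, if the list was previously empty, schedule the per-inode
 * completion work. The actual transactional work happens later in
 * xfs_end_io().
 */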
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
					 &ip->i_ioend_work));
	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
	struct iomap_writepage_ctx	*wpc,
	struct xfs_inode		*ip,
	loff_t				offset)
{
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length)
		return false;
	/*
	 * If this is a COW mapping, it is sufficient to check that the mapping
	 * covers the offset. Be careful to check this first because the caller
	 * can revalidate a COW mapping without updating the data seqno.
	 */
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		return true;

	/*
	 * This is not a COW mapping. Check the sequence number of the data
	 * fork because concurrent changes could have invalidated the extent.
	 * Check the COW fork because concurrent changes since the last time
	 * we checked (and found nothing at this offset) could have added
	 * overlapping blocks.
	 */
	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
		return false;
	}
	if (xfs_inode_has_cow_data(ip) &&
	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
		return false;
	}
	return true;
}

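/*
 * Provide the block mapping for the range of the folio currently being
 * written back. Reuse the cached mapping if it is still valid; otherwise
 * look up the extent covering this offset, giving the COW fork precedence
 * over the data fork, and convert any delalloc extent found into a real
 * allocation.
 */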
static int
xfs_map_blocks(
	struct iomap_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_fileoff_t		cow_fsb;
	int			whichfork;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;
	int			retries = 0;
	int			error = 0;
	unsigned int		*seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared. COW I/O always takes precedence, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK
	 * because we've indirectly protected against concurrent updates:
	 * writeback has the page locked, which prevents concurrent
	 * invalidations by reflink and directio and prevents concurrent
	 * buffered writes to the same page. Changes to if_seq always happen
	 * under i_lock, which protects against concurrent updates and
	 * provides a memory barrier on the way out that ensures that we
	 * always see the current value.
	 */
	if (xfs_imap_valid(wpc, ip, offset))
		return 0;

	/*
	 * If we don't have a valid map, now it's time to get a new one for
	 * this offset. This will convert delayed allocations (including COW
	 * ones) into real extents. If we return without a valid map, it means
	 * we landed in a hole and we skip the block.
	 */
retry:
	cow_fsb = NULLFILEOFF;
	whichfork = XFS_DATA_FORK;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(!xfs_need_iread_extents(&ip->i_df));

	/*
	 * Check if this offset is covered by a COW extent, and if so use it
	 * directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		whichfork = XFS_COW_FORK;
		goto allocate_blocks;
	}

	/*
	 * No COW extent overlap. Revalidate now that we may have updated
	 * ->cow_seq. If the data mapping is still valid, we're done.
	 */
	if (xfs_imap_valid(wpc, ip, offset)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * No valid cached mapping: look up the extent covering this offset in
	 * the data fork.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/* landed in a hole or beyond EOF? */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
	}

	/*
	 * Truncate to the next COW extent if there is one. This is the only
	 * opportunity to do this because we can skip COW fork lookups for the
	 * subsequent blocks in the mapping; however, the requirement to treat
	 * the COW range separately remains.
	 */
	if (cow_fsb != NULLFILEOFF &&
	    cow_fsb < imap.br_startoff + imap.br_blockcount)
		imap.br_blockcount = cow_fsb - imap.br_startoff;

	/* got a delalloc extent? */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    isnullstartblock(imap.br_startblock))
		goto allocate_blocks;

	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
	return 0;
allocate_blocks:
	/*
	 * Convert a delalloc extent to a real one. The current page is held
	 * locked so nothing could have removed the block backing offset_fsb,
	 * although it could have moved from the COW to the data fork by
	 * another thread.
	 */
	if (whichfork == XFS_COW_FORK)
		seq = &XFS_WPC(wpc)->cow_seq;
	else
		seq = &XFS_WPC(wpc)->data_seq;

	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
			&wpc->iomap, seq);
	if (error) {
		/*
		 * If we failed to find the extent in the COW fork we might
		 * have raced with a COW to data fork conversion or truncate.
		 * Restart the lookup to catch the extent in the data fork for
		 * the former case, but prevent additional retries to avoid
		 * looping forever for the latter case.
		 */
		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
			goto retry;
		ASSERT(error != -EAGAIN);
		return error;
	}

	/*
	 * Due to merging the returned real extent might be larger than the
	 * original delalloc one. Trim the returned extent to the next COW
	 * boundary again to force a re-lookup.
	 */
	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
			wpc->iomap.length = cow_offset - wpc->iomap.offset;
	}

	ASSERT(wpc->iomap.offset <= offset);
	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
	return 0;
}

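/*
 * Prepare an ioend for submission: convert any CoW extents covered by the
 * ioend to regular extents, and install a completion handler on ioends that
 * will need transactional work (file size updates, unwritten extent
 * conversion or CoW remapping) once the I/O completes.
 */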
static int
xfs_prepare_ioend(
	struct iomap_ioend	*ioend,
	int			status)
{
	unsigned int		nofs_flag;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/* Convert CoW extents to regular */
	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
	}

	memalloc_nofs_restore(nofs_flag);

	/* send ioends that might require a transaction to the completion wq */
	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
	    (ioend->io_flags & IOMAP_F_SHARED))
		ioend->io_bio.bi_end_io = xfs_end_bio;
	return status;
}

/*
 * If the folio has delalloc blocks on it, the caller is asking us to punch
 * them out. If we don't, we can leave a stale delalloc mapping covered by a
 * clean page that needs to be dirtied again before the delalloc mapping can
 * be converted. This stale delalloc mapping can trip up a later direct I/O
 * read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the folio.
 * Because they are delalloc, we can do this without needing a transaction.
 * Indeed - if we get ENOSPC errors, we have to be able to do this truncation
 * without a transaction as there is no space left for block reservation
 * (typically why we see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
	struct folio		*folio,
	loff_t			pos)
{
	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
	struct xfs_mount	*mp = ip->i_mount;

	if (xfs_is_shutdown(mp))
		return;

	xfs_alert_ratelimited(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
			folio, ip->i_ino, pos);

	/*
	 * The end of the punch range is always the offset of the first byte
	 * of the next folio. Hence the end offset is only dependent on the
	 * folio itself and not the start offset that is passed in.
	 */
	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
			folio_pos(folio) + folio_size(folio));
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
	.map_blocks		= xfs_map_blocks,
	.prepare_ioend		= xfs_prepare_ioend,
	.discard_folio		= xfs_discard_folio,
};

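/*
 * Write back dirty folios through the generic iomap writeback code, with a
 * stack-allocated writeback context that caches the current mapping and the
 * fork sequence numbers it was derived from.
 */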
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = { };

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}

STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O. We really can't allow
	 * that on reflink inodes, so we have to skip out here. And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

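/*
 * Buffered read and readahead are handled entirely by the generic iomap
 * code; all we provide is the xfs_read_iomap_ops mapping operations.
 */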
STATIC int
xfs_vm_read_folio(
	struct file		*unused,
	struct folio		*folio)
{
	return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
	struct readahead_control	*rac)
{
	iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_vm_swap_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	struct xfs_inode		*ip = XFS_I(file_inode(swap_file));

	/*
	 * Swap file activation can race against concurrent shared extent
	 * removal in files that have been cloned. If this happens,
	 * iomap_swapfile_iter() can fail because it encountered a shared
	 * extent even though an operation is in progress to remove those
	 * shared extents.
	 *
	 * This race becomes problematic when we defer extent removal
	 * operations beyond the end of a syscall (i.e. use async background
	 * processing algorithms). Users think the extents are no longer
	 * shared, but iomap_swapfile_iter() still sees them as shared
	 * because the refcountbt entries for the extents being removed have
	 * not yet been updated. Hence the swapon call fails unexpectedly.
	 *
	 * The race condition is currently most obvious from the unlink()
	 * operation as extent removal is deferred until after the last
	 * reference to the inode goes away. We then process the extent
	 * removal asynchronously, hence triggering the "syscall completed but
	 * work not done" condition mentioned above. To close this race
	 * window, we need to flush any pending inodegc operations to ensure
	 * they have updated the refcountbt records before we try to map the
	 * swapfile.
	 */
	xfs_inodegc_flush(ip->i_mount);

	/*
	 * Direct the swap code to the correct block device when this file
	 * sits on the RT device.
	 */
	sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;

	return iomap_swapfile_activate(sis, swap_file, span,
			&xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.read_folio		= xfs_vm_read_folio,
	.readahead		= xfs_vm_readahead,
	.writepages		= xfs_vm_writepages,
	.dirty_folio		= iomap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.bmap			= xfs_vm_bmap,
	.migrate_folio		= filemap_migrate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= xfs_vm_swap_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.dirty_folio		= noop_dirty_folio,
	.swap_activate		= xfs_vm_swap_activate,
};