// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota_defs.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_rmap_btree.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks.  This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism.  At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible.  This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock.  (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)  delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope of landing out of
 * order nearby CoW writes in a single extent on disk, thereby reducing
 * fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.  The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded.  IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork.  This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents.  This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork.  Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out -- we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written.  This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type.  This is required for direct io
 * because we only have one ioend for the whole dio, and we have to be
 * able to remember the presence of unwritten blocks and CoW blocks with
 * a single ioend structure.  Better yet, the more ground we can cover
 * with one ioend, the better.
 */
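
/*
 * Editor's sketch of the lifecycle described above (illustrative only,
 * not part of the original source; control flow heavily simplified, but
 * the helpers named are the real ones defined in this file):
 *
 *      write_begin/page_mkwrite:
 *              check sharedness with xfs_reflink_trim_around_shared()
 *              if shared, reserve delalloc blocks in the CoW fork
 *      writeback (writepage):
 *              allocate unwritten CoW fork extents for the delalloc
 *              reservation, then xfs_reflink_convert_cow() the blocks
 *              actually being written before submitting the bio
 *      I/O completion:
 *              xfs_reflink_end_cow(): move each written CoW fork extent
 *              into the data fork and drop the old data fork mapping
 */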

/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen.  If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.  If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
int
xfs_reflink_find_shared(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_extlen_t            aglen,
        xfs_agblock_t           *fbno,
        xfs_extlen_t            *flen,
        bool                    find_end_of_shared)
{
        struct xfs_buf          *agbp;
        struct xfs_btree_cur    *cur;
        int                     error;

        error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
        if (error)
                return error;
        if (!agbp)
                return -ENOMEM;

        cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);

        error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
                        find_end_of_shared);

        xfs_btree_del_cursor(cur, error);

        xfs_trans_brelse(tp, agbp);
        return error;
}

/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
 */
int
xfs_reflink_trim_around_shared(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *irec,
        bool                    *shared)
{
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        xfs_extlen_t            aglen;
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        int                     error = 0;

        /* Holes, unwritten, and delalloc extents cannot be shared */
        if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
                *shared = false;
                return 0;
        }

        trace_xfs_reflink_trim_around_shared(ip, irec);

        agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
        agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
        aglen = irec->br_blockcount;

        error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
                        aglen, &fbno, &flen, true);
        if (error)
                return error;

        *shared = false;
        if (fbno == NULLAGBLOCK) {
                /* No shared blocks at all. */
                return 0;
        } else if (fbno == agbno) {
                /*
                 * The start of this extent is shared.  Truncate the
                 * mapping at the end of the shared region so that a
                 * subsequent iteration starts at the start of the
                 * unshared region.
                 */
                irec->br_blockcount = flen;
                *shared = true;
                return 0;
        } else {
                /*
                 * There's a shared extent midway through this extent.
                 * Truncate the mapping at the start of the shared
                 * extent so that a subsequent iteration starts at the
                 * start of the shared region.
                 */
                irec->br_blockcount = fbno - agbno;
                return 0;
        }
}
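
/*
 * Illustrative caller loop (editor's sketch, not part of the original
 * source): because xfs_reflink_trim_around_shared() truncates @irec at
 * the next shared/unshared boundary, a caller can classify an entire
 * mapping by re-reading and re-trimming until the range is exhausted:
 *
 *      while (off < end) {
 *              nimaps = 1;
 *              xfs_bmapi_read(ip, off, end - off, &irec, &nimaps, 0);
 *              error = xfs_reflink_trim_around_shared(ip, &irec, &shared);
 *              if (error)
 *                      break;
 *              // irec is now uniformly shared or unshared
 *              off = irec.br_startoff + irec.br_blockcount;
 *      }
 */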

int
xfs_inode_need_cow(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        bool                    *shared)
{
        /* We can't update any real extents in always COW mode. */
        if (xfs_is_always_cow_inode(ip) &&
            !isnullstartblock(imap->br_startblock)) {
                *shared = true;
                return 0;
        }

        /* Trim the mapping to the nearest shared extent boundary. */
        return xfs_reflink_trim_around_shared(ip, imap, shared);
}

static int
xfs_reflink_convert_cow_locked(
        struct xfs_inode        *ip,
        xfs_fileoff_t           offset_fsb,
        xfs_filblks_t           count_fsb)
{
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    got;
        struct xfs_btree_cur    *dummy_cur = NULL;
        int                     dummy_logflags;
        int                     error = 0;

        if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
                return 0;

        do {
                if (got.br_startoff >= offset_fsb + count_fsb)
                        break;
                if (got.br_state == XFS_EXT_NORM)
                        continue;
                if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
                        return -EIO;

                xfs_trim_extent(&got, offset_fsb, count_fsb);
                if (!got.br_blockcount)
                        continue;

                got.br_state = XFS_EXT_NORM;
                error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
                                XFS_COW_FORK, &icur, &dummy_cur, &got,
                                &dummy_logflags);
                if (error)
                        return error;
        } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));

        return error;
}

/* Convert all of the unwritten CoW extents in a file's range to real ones. */
int
xfs_reflink_convert_cow(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        xfs_off_t               count)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_filblks_t           count_fsb = end_fsb - offset_fsb;
        int                     error;

        ASSERT(count != 0);

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}
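
/*
 * Editor's note, with a worked example (hypothetical numbers, not from
 * the original source): the byte range is rounded outward to whole
 * blocks above, since XFS_B_TO_FSBT() rounds down and XFS_B_TO_FSB()
 * rounds up.  With 4096-byte blocks, offset = 5000 and count = 4000:
 *
 *      offset_fsb = XFS_B_TO_FSBT(mp, 5000);           // block 1
 *      end_fsb = XFS_B_TO_FSB(mp, 5000 + 4000);        // block 3
 *      count_fsb = end_fsb - offset_fsb;               // blocks 1 and 2
 */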

/*
 * Find the extent that maps the given range in the COW fork.  Even if the
 * extent is not shared we might have a preallocation for it in the COW
 * fork.  If so, use it rather than triggering a new allocation.
 */
static int
xfs_find_trim_cow_extent(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        bool                    *shared,
        bool                    *found)
{
        xfs_fileoff_t           offset_fsb = imap->br_startoff;
        xfs_filblks_t           count_fsb = imap->br_blockcount;
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    got;

        *found = false;

        /*
         * If we don't find an overlapping extent, trim the range we need to
         * allocate to fit the hole we found.
         */
        if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
                got.br_startoff = offset_fsb + count_fsb;
        if (got.br_startoff > offset_fsb) {
                xfs_trim_extent(imap, imap->br_startoff,
                                got.br_startoff - imap->br_startoff);
                return xfs_inode_need_cow(ip, imap, shared);
        }

        *shared = true;
        if (isnullstartblock(got.br_startblock)) {
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
                return 0;
        }

        /* real extent found - no need to allocate */
        xfs_trim_extent(&got, offset_fsb, count_fsb);
        *imap = got;
        *found = true;
        return 0;
}

/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        bool                    *shared,
        uint                    *lockmode,
        bool                    convert_now)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = imap->br_startoff;
        xfs_filblks_t           count_fsb = imap->br_blockcount;
        struct xfs_trans        *tp;
        int                     nimaps, error = 0;
        bool                    found;
        xfs_filblks_t           resaligned;
        xfs_extlen_t            resblks = 0;

        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        if (!ip->i_cowfp) {
                ASSERT(!xfs_is_reflink_inode(ip));
                xfs_ifork_init_cow(ip);
        }

        error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
        if (error || !*shared)
                return error;
        if (found)
                goto convert;

        resaligned = xfs_aligned_fsb_count(imap->br_startoff,
                imap->br_blockcount, xfs_get_cowextsz_hint(ip));
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

        xfs_iunlock(ip, *lockmode);
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
        *lockmode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, *lockmode);

        if (error)
                return error;

        error = xfs_qm_dqattach_locked(ip, false);
        if (error)
                goto out_trans_cancel;

        /*
         * Check for an overlapping extent again now that we dropped the ilock.
         */
        error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
        if (error || !*shared)
                goto out_trans_cancel;
        if (found) {
                xfs_trans_cancel(tp);
                goto convert;
        }

        error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
                        XFS_QMOPT_RES_REGBLKS);
        if (error)
                goto out_trans_cancel;

        xfs_trans_ijoin(tp, ip, 0);

        /* Allocate the entire reservation as unwritten blocks. */
        nimaps = 1;
        error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
                        XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
                        resblks, imap, &nimaps);
        if (error)
                goto out_unreserve;

        xfs_inode_set_cowblocks_tag(ip);
        error = xfs_trans_commit(tp);
        if (error)
                return error;

        /*
         * Allocation succeeded but the requested range was not even partially
         * satisfied?  Bail out!
         */
        if (nimaps == 0)
                return -ENOSPC;
convert:
        xfs_trim_extent(imap, offset_fsb, count_fsb);
        /*
         * COW fork extents are supposed to remain unwritten until we're ready
         * to initiate a disk write.  For direct I/O we are going to write the
         * data and need the conversion, but for buffered writes we're done.
         */
        if (!convert_now || imap->br_state == XFS_EXT_NORM)
                return 0;
        trace_xfs_reflink_convert_cow(ip, imap);
        return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);

out_unreserve:
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
                        XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
        xfs_trans_cancel(tp);
        return error;
}
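
/*
 * Editor's sketch of the locking pattern above (not part of the original
 * source): xfs_trans_alloc() may sleep waiting for log space, so the
 * ILOCK is cycled around it and the COW fork must then be re-checked,
 * because another thread may have created the extent in the meantime:
 *
 *      xfs_find_trim_cow_extent(...);          // first look, under ILOCK
 *      xfs_iunlock(ip, *lockmode);
 *      xfs_trans_alloc(...);                   // may block
 *      xfs_ilock(ip, XFS_ILOCK_EXCL);
 *      xfs_find_trim_cow_extent(...);          // look again; maybe found
 */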

/*
 * Cancel CoW reservations for some block range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 *
 * Caller must have already joined the inode to the current transaction.  The
 * inode will be joined to the transaction returned to the caller.
 */
int
xfs_reflink_cancel_cow_blocks(
        struct xfs_inode        *ip,
        struct xfs_trans        **tpp,
        xfs_fileoff_t           offset_fsb,
        xfs_fileoff_t           end_fsb,
        bool                    cancel_real)
{
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        struct xfs_bmbt_irec    got, del;
        struct xfs_iext_cursor  icur;
        int                     error = 0;

        if (!xfs_inode_has_cow_data(ip))
                return 0;
        if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
                return 0;

        /* Walk backwards until we're out of the I/O range... */
        while (got.br_startoff + got.br_blockcount > offset_fsb) {
                del = got;
                xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

                /* Extent delete may have bumped ext forward */
                if (!del.br_blockcount) {
                        xfs_iext_prev(ifp, &icur);
                        goto next_extent;
                }

                trace_xfs_reflink_cancel_cow(ip, &del);

                if (isnullstartblock(del.br_startblock)) {
                        error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
                                        &icur, &got, &del);
                        if (error)
                                break;
                } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
                        ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);

                        /* Free the CoW orphan record. */
                        error = xfs_refcount_free_cow_extent(*tpp,
                                        del.br_startblock, del.br_blockcount);
                        if (error)
                                break;

                        xfs_bmap_add_free(*tpp, del.br_startblock,
                                        del.br_blockcount, NULL);

                        /* Roll the transaction */
                        error = xfs_defer_finish(tpp);
                        if (error)
                                break;

                        /* Remove the mapping from the CoW fork. */
                        xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

                        /* Remove the quota reservation */
                        error = xfs_trans_reserve_quota_nblks(NULL, ip,
                                        -(long)del.br_blockcount, 0,
                                        XFS_QMOPT_RES_REGBLKS);
                        if (error)
                                break;
                } else {
                        /* Didn't do anything, push cursor back. */
                        xfs_iext_prev(ifp, &icur);
                }
next_extent:
                if (!xfs_iext_get_extent(ifp, &icur, &got))
                        break;
        }

        /* clear tag if cow fork is emptied */
        if (!ifp->if_bytes)
                xfs_inode_clear_cowblocks_tag(ip);
        return error;
}
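
/*
 * Editor's sketch of the backwards walk above (not part of the original
 * source): xfs_iext_lookup_extent_before() places the cursor on the last
 * extent starting before end_fsb; deleting an extent effectively moves
 * the cursor to the previous one, so the loop only calls xfs_iext_prev()
 * explicitly when it removed nothing:
 *
 *      xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got);
 *      while (got overlaps [offset_fsb, end_fsb)) {
 *              delete the overlapping piece, or xfs_iext_prev() if not;
 *              xfs_iext_get_extent(ifp, &icur, &got);  // refresh got
 *      }
 */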

/*
 * Cancel CoW reservations for some byte range of an inode.
 *
 * If cancel_real is true this function cancels all COW fork extents for the
 * inode; if cancel_real is false, real extents are not cleared.
 */
int
xfs_reflink_cancel_cow_range(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        xfs_off_t               count,
        bool                    cancel_real)
{
        struct xfs_trans        *tp;
        xfs_fileoff_t           offset_fsb;
        xfs_fileoff_t           end_fsb;
        int                     error;

        trace_xfs_reflink_cancel_cow_range(ip, offset, count);
        ASSERT(ip->i_cowfp);

        offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
        if (count == NULLFILEOFF)
                end_fsb = NULLFILEOFF;
        else
                end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

        /* Start a rolling transaction to remove the mappings */
        error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
                        0, 0, XFS_TRANS_NOFS, &tp);
        if (error)
                goto out;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);

        /* Scrape out the old CoW reservations */
        error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
                        cancel_real);
        if (error)
                goto out_cancel;

        error = xfs_trans_commit(tp);

        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;

out_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
        trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
        return error;
}
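
/*
 * Editor's usage note (an assumption about callers, not from the
 * original source): passing NULLFILEOFF as the count makes end_fsb
 * open-ended, so dropping every CoW reservation in a file -- e.g. when
 * the inode is being reclaimed -- would look like:
 *
 *      error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
 */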

/*
 * Remap part of the CoW fork into the data fork.
 *
 * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
 * into the data fork; this function will remap what it can (at the end of the
 * range) and update @end_fsb appropriately.  Each remap gets its own
 * transaction because we can end up merging and splitting bmbt blocks for
 * every remap operation and we'd like to keep the block reservation
 * requirements as low as possible.
 */
STATIC int
xfs_reflink_end_cow_extent(
        struct xfs_inode        *ip,
        xfs_fileoff_t           offset_fsb,
        xfs_fileoff_t           *end_fsb)
{
        struct xfs_bmbt_irec    got, del;
        struct xfs_iext_cursor  icur;
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        xfs_filblks_t           rlen;
        unsigned int            resblks;
        int                     error;

        /* No COW extents?  That's easy! */
        if (ifp->if_bytes == 0) {
                *end_fsb = offset_fsb;
                return 0;
        }

        resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
                        XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
        if (error)
                return error;

        /*
         * Lock the inode.  We have to ijoin without automatic unlock because
         * the lead transaction is the refcountbt record deletion; the data
         * fork update follows as a deferred log item.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);

        /*
         * In the case of racing, overlapping AIO writes, no COW extents
         * might be left by the time I/O completes for the loser of the race.
         * In that case we are done.
         */
        if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
            got.br_startoff + got.br_blockcount <= offset_fsb) {
                *end_fsb = offset_fsb;
                goto out_cancel;
        }

        /*
         * Structure copy @got into @del, then trim @del to the range that we
         * were asked to remap.  We preserve @got for the eventual CoW fork
         * deletion; from now on @del represents the mapping that we're
         * actually remapping.
         */
        del = got;
        xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);

        ASSERT(del.br_blockcount > 0);

        /*
         * Only remap real extents that contain data.  With AIO, speculative
         * preallocations can leak into the range we are asked to remap, and
         * we need to skip them.
         */
        if (!xfs_bmap_is_real_extent(&got)) {
                *end_fsb = del.br_startoff;
                goto out_cancel;
        }

        /* Unmap the old blocks in the data fork. */
        rlen = del.br_blockcount;
        error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
        if (error)
                goto out_cancel;

        /* Trim the extent to whatever got unmapped. */
        xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
        trace_xfs_reflink_cow_remap(ip, &del);

        /* Free the CoW orphan record. */
        error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
                        del.br_blockcount);
        if (error)
                goto out_cancel;

        /* Map the new blocks into the data fork. */
        error = xfs_bmap_map_extent(tp, ip, &del);
        if (error)
                goto out_cancel;

        /* Charge this new data fork mapping to the on-disk quota. */
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
                        (long)del.br_blockcount);

        /* Remove the mapping from the CoW fork. */
        xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

        error = xfs_trans_commit(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;

        /* Update the caller about how much progress we made. */
        *end_fsb = del.br_startoff;
        return 0;

out_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}
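
/*
 * Editor's sketch (not part of the original source): each call remaps
 * only the last CoW extent in the range and pulls *end_fsb back to the
 * start of whatever it remapped, so a caller walks the whole range
 * backwards with a simple loop -- exactly the loop that
 * xfs_reflink_end_cow() below uses:
 *
 *      while (end_fsb > offset_fsb && !error)
 *              error = xfs_reflink_end_cow_extent(ip, offset_fsb,
 *                              &end_fsb);
 */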

/*
 * Remap parts of a file's data fork after a successful CoW.
 */
int
xfs_reflink_end_cow(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        xfs_off_t               count)
{
        xfs_fileoff_t           offset_fsb;
        xfs_fileoff_t           end_fsb;
        int                     error = 0;

        trace_xfs_reflink_end_cow(ip, offset, count);

        offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
        end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

        /*
         * Walk backwards until we're out of the I/O range.  The loop function
         * repeatedly cycles the ILOCK to allocate one transaction per remapped
         * extent.
         *
         * If we're being called by writeback then the pages will still
         * have PageWriteback set, which prevents races with reflink remapping
         * and truncate.  Reflink remapping prevents races with writeback by
         * taking the iolock and mmaplock before flushing the pages and
         * remapping, which means there won't be any further writeback or page
         * cache dirtying until the reflink completes.
         *
         * We should never have two threads issuing writeback for the same file
         * region.  There are also post-eof checks in the writeback
         * preparation code so that we don't bother writing out pages that are
         * about to be truncated.
         *
         * If we're being called as part of directio write completion, the dio
         * count is still elevated, which reflink and truncate will wait for.
         * Reflink remapping takes the iolock and mmaplock and waits for
         * pending dio to finish, which should prevent any directio until the
         * remap completes.  Multiple concurrent directio writes to the same
         * region are handled by end_cow processing only occurring for the
         * threads which succeed; the outcome of multiple overlapping direct
         * writes is not well defined anyway.
         *
         * It's possible that a buffered write and a direct write could collide
         * here (the buffered write stumbles in after the dio flushes and
         * invalidates the page cache and immediately queues writeback), but we
         * have never supported this 100%.  If either disk write succeeds the
         * blocks will be remapped.
         */
        while (end_fsb > offset_fsb && !error)
                error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);

        if (error)
                trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
        return error;
}

/*
 * Free leftover CoW reservations that didn't get cleaned out.
 */
int
xfs_reflink_recover_cow(
        struct xfs_mount        *mp)
{
        xfs_agnumber_t          agno;
        int                     error = 0;

        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return 0;

        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
                error = xfs_refcount_recover_cow_leftovers(mp, agno);
                if (error)
                        break;
        }

        return error;
}

/*
 * Reflinking (Block) Ranges of Two Files Together
 *
 * First, ensure that the reflink flag is set on both inodes.  The flag is an
 * optimization to avoid unnecessary refcount btree lookups in the write path.
 *
 * Now we can iteratively remap the range of extents (and holes) in src to the
 * corresponding ranges in dest.  Let drange and srange denote the ranges of
 * logical blocks in dest and src touched by the reflink operation.
 *
 * While the length of drange is greater than zero,
 *    - Read src's bmbt at the start of srange ("imap")
 *    - If imap doesn't exist, make imap appear to start at the end of srange
 *      with zero length.
 *    - If imap starts before srange, advance imap to start at srange.
 *    - If imap goes beyond srange, truncate imap to end at the end of srange.
 *    - Punch (imap start - srange start + imap len) blocks from dest at
 *      offset (drange start).
 *    - If imap points to a real range of pblks,
 *         > Increase the refcount of the imap's pblks
 *         > Map imap's pblks into dest at the offset
 *           (drange start + imap start - srange start)
 *    - Advance drange and srange by (imap start - srange start + imap len)
 *
 * Finally, if the reflink made dest longer, update both the in-core and
 * on-disk file sizes.
 *
 * ASCII Art Demonstration:
 *
 * Let's say we want to reflink this source file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS (src file)
 *   <-------------------->
 *
 * into this destination file:
 *
 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
 *        <-------------------->
 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
 * Observe that the range has different logical offsets in either file.
 *
 * Consider that the first extent in the source file doesn't line up with our
 * reflink range.  Unmapping and remapping are separate operations, so we can
 * unmap more blocks from the destination file than we remap.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD---------DDDDD--DDD
 *        <------->
 *
 * Now remap the source extent into the destination file:
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *   <------->
 * --DDDDD--SSSSSSSDDDDD--DDD
 *        <------->
 *
 * Do likewise with the second hole and extent in our range.  Holes in the
 * unmap range don't affect our operation.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *            <---->
 * --DDDDD--SSSSSSS-SSSSS-DDD
 *                 <---->
 *
 * Finally, unmap and remap part of the third extent.  This will increase the
 * size of the destination file.
 *
 * ----SSSSSSS-SSSSS----SSSSSS
 *                  <----->
 * --DDDDD--SSSSSSS-SSSSS----SSS
 *                       <----->
 *
 * Once we update the destination file's i_size, we're done.
 */
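
/*
 * Editor's sketch of the loop described above (not part of the original
 * source; compare xfs_reflink_remap_blocks() below):
 *
 *      while (len > 0) {
 *              xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
 *              range_len = imap.br_startoff + imap.br_blockcount - srcoff;
 *              imap.br_startoff += destoff - srcoff;   // translate to dest
 *              // punch the old dest blocks and map imap in:
 *              xfs_reflink_remap_extent(dest, &imap, destoff, new_isize);
 *              srcoff += range_len;
 *              destoff += range_len;
 *              len -= range_len;
 *      }
 */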

/*
 * Ensure the reflink bit is set in both inodes.
 */
STATIC int
xfs_reflink_set_inode_flag(
        struct xfs_inode        *src,
        struct xfs_inode        *dest)
{
        struct xfs_mount        *mp = src->i_mount;
        int                     error;
        struct xfs_trans        *tp;

        if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
                return 0;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
                goto out_error;

        /* Lock both files against IO */
        if (src->i_ino == dest->i_ino)
                xfs_ilock(src, XFS_ILOCK_EXCL);
        else
                xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);

        if (!xfs_is_reflink_inode(src)) {
                trace_xfs_reflink_set_inode_flag(src);
                xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
                src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
                xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
                xfs_ifork_init_cow(src);
        } else
                xfs_iunlock(src, XFS_ILOCK_EXCL);

        if (src->i_ino == dest->i_ino)
                goto commit_flags;

        if (!xfs_is_reflink_inode(dest)) {
                trace_xfs_reflink_set_inode_flag(dest);
                xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
                dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
                xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
                xfs_ifork_init_cow(dest);
        } else
                xfs_iunlock(dest, XFS_ILOCK_EXCL);

commit_flags:
        error = xfs_trans_commit(tp);
        if (error)
                goto out_error;
        return error;

out_error:
        trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
        return error;
}

/*
 * Update destination inode size & cowextsize hint, if necessary.
 */
int
xfs_reflink_update_dest(
        struct xfs_inode        *dest,
        xfs_off_t               newlen,
        xfs_extlen_t            cowextsize,
        unsigned int            remap_flags)
{
        struct xfs_mount        *mp = dest->i_mount;
        struct xfs_trans        *tp;
        int                     error;

        if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
                return 0;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
                goto out_error;

        xfs_ilock(dest, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);

        if (newlen > i_size_read(VFS_I(dest))) {
                trace_xfs_reflink_update_inode_size(dest, newlen);
                i_size_write(VFS_I(dest), newlen);
                dest->i_d.di_size = newlen;
        }

        if (cowextsize) {
                dest->i_d.di_cowextsize = cowextsize;
                dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
        }

        xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);

        error = xfs_trans_commit(tp);
        if (error)
                goto out_error;
        return error;

out_error:
        trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
        return error;
}

/*
 * Do we have enough reserve in this AG to handle a reflink?  The refcount
 * btree already reserved all the space it needs, but the rmap btree can grow
 * infinitely, so we won't allow more reflinks when the AG is down to the
 * btree reserves.
 */
static int
xfs_reflink_ag_has_free_space(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_perag        *pag;
        int                     error = 0;

        if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
                return 0;

        pag = xfs_perag_get(mp, agno);
        if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
            xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
                error = -ENOSPC;
        xfs_perag_put(pag);
        return error;
}

/*
 * Unmap a range of blocks from a file, then map other blocks into the hole.
 * The range to unmap is (destoff : irec->br_startoff + irec->br_blockcount).
 * The extent irec is mapped into dest at irec->br_startoff.
 */
STATIC int
xfs_reflink_remap_extent(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *irec,
        xfs_fileoff_t           destoff,
        xfs_off_t               new_isize)
{
        struct xfs_mount        *mp = ip->i_mount;
        bool                    real_extent = xfs_bmap_is_real_extent(irec);
        struct xfs_trans        *tp;
        unsigned int            resblks;
        struct xfs_bmbt_irec    uirec;
        xfs_filblks_t           rlen;
        xfs_filblks_t           unmap_len;
        xfs_off_t               newlen;
        int                     error;

        unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
        trace_xfs_reflink_punch_range(ip, destoff, unmap_len);

        /* No reflinking if we're low on space */
        if (real_extent) {
                error = xfs_reflink_ag_has_free_space(mp,
                                XFS_FSB_TO_AGNO(mp, irec->br_startblock));
                if (error)
                        goto out;
        }

        /* Start a rolling transaction to switch the mappings */
        resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
        if (error)
                goto out;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);

        /* If we're not just clearing space, then do we have enough quota? */
        if (real_extent) {
                error = xfs_trans_reserve_quota_nblks(tp, ip,
                                irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto out_cancel;
        }

        trace_xfs_reflink_remap(ip, irec->br_startoff,
                        irec->br_blockcount, irec->br_startblock);

        /* Unmap the old blocks in the data fork. */
        rlen = unmap_len;
        while (rlen) {
                ASSERT(tp->t_firstblock == NULLFSBLOCK);
                error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
                if (error)
                        goto out_cancel;

                /*
                 * Trim the extent to whatever got unmapped.
                 * Remember, bunmapi works backwards.
                 */
                uirec.br_startblock = irec->br_startblock + rlen;
                uirec.br_startoff = irec->br_startoff + rlen;
                uirec.br_blockcount = unmap_len - rlen;
                unmap_len = rlen;

                /* If this isn't a real mapping, we're done. */
                if (!real_extent || uirec.br_blockcount == 0)
                        goto next_extent;

                trace_xfs_reflink_remap(ip, uirec.br_startoff,
                                uirec.br_blockcount, uirec.br_startblock);

                /* Update the refcount tree */
                error = xfs_refcount_increase_extent(tp, &uirec);
                if (error)
                        goto out_cancel;

                /* Map the new blocks into the data fork. */
                error = xfs_bmap_map_extent(tp, ip, &uirec);
                if (error)
                        goto out_cancel;

                /* Update quota accounting. */
                xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
                                uirec.br_blockcount);

                /* Update dest isize if needed. */
                newlen = XFS_FSB_TO_B(mp,
                                uirec.br_startoff + uirec.br_blockcount);
                newlen = min_t(xfs_off_t, newlen, new_isize);
                if (newlen > i_size_read(VFS_I(ip))) {
                        trace_xfs_reflink_update_inode_size(ip, newlen);
                        i_size_write(VFS_I(ip), newlen);
                        ip->i_d.di_size = newlen;
                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                }

next_extent:
                /* Process all the deferred stuff. */
                error = xfs_defer_finish(&tp);
                if (error)
                        goto out_cancel;
        }

        error = xfs_trans_commit(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                goto out;
        return 0;

out_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
        trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
        return error;
}
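
/*
 * Editor's worked example for the loop above (hypothetical numbers, not
 * from the original source): __xfs_bunmapi() removes blocks from the end
 * of the range, so if we ask to unmap 10 blocks and it only manages the
 * last 6, rlen comes back as 4 and the tail is what can be remapped now:
 *
 *      uirec.br_startblock = irec->br_startblock + 4;  // skip 4 blocks
 *      uirec.br_startoff = irec->br_startoff + 4;
 *      uirec.br_blockcount = 10 - 4;                   // the unmapped 6
 *
 * The next iteration then retries the remaining 4 leading blocks.
 */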

/*
 * Iteratively remap one file's extents (and holes) to another's.
 */
int
xfs_reflink_remap_blocks(
        struct xfs_inode        *src,
        loff_t                  pos_in,
        struct xfs_inode        *dest,
        loff_t                  pos_out,
        loff_t                  remap_len,
        loff_t                  *remapped)
{
        struct xfs_bmbt_irec    imap;
        xfs_fileoff_t           srcoff;
        xfs_fileoff_t           destoff;
        xfs_filblks_t           len;
        xfs_filblks_t           range_len;
        xfs_filblks_t           remapped_len = 0;
        xfs_off_t               new_isize = pos_out + remap_len;
        int                     nimaps;
        int                     error = 0;

        destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
        srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
        len = XFS_B_TO_FSB(src->i_mount, remap_len);

        /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
        while (len) {
                uint            lock_mode;

                trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
                                dest, destoff);

                /* Read extent from the source file */
                nimaps = 1;
                lock_mode = xfs_ilock_data_map_shared(src);
                error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
                xfs_iunlock(src, lock_mode);
                if (error)
                        break;
                ASSERT(nimaps == 1);

                trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
                                &imap);

                /* Translate imap into the destination file. */
                range_len = imap.br_startoff + imap.br_blockcount - srcoff;
                imap.br_startoff += destoff - srcoff;

                /* Clear dest from destoff to the end of imap and map it in. */
                error = xfs_reflink_remap_extent(dest, &imap, destoff,
                                new_isize);
                if (error)
                        break;

                if (fatal_signal_pending(current)) {
                        error = -EINTR;
                        break;
                }

                /* Advance drange/srange */
                srcoff += range_len;
                destoff += range_len;
                len -= range_len;
                remapped_len += range_len;
        }

        if (error)
                trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
        *remapped = min_t(loff_t, remap_len,
                        XFS_FSB_TO_B(src->i_mount, remapped_len));
        return error;
}

/*
 * Grab the exclusive iolock for a data copy from src to dest, making sure to
 * abide by the vfs locking order (lowest pointer value goes first) and
 * breaking the pnfs layout leases on dest before proceeding.  The loop
 * is needed because we cannot call the blocking break_layout() with the
 * src iolock held, and therefore have to back out both locks.
 */
static int
xfs_iolock_two_inodes_and_break_layout(
        struct inode            *src,
        struct inode            *dest)
{
        int                     error;

retry:
        if (src < dest) {
                inode_lock_shared(src);
                inode_lock_nested(dest, I_MUTEX_NONDIR2);
        } else {
                /* src >= dest */
                inode_lock(dest);
        }

        error = break_layout(dest, false);
        if (error == -EWOULDBLOCK) {
                inode_unlock(dest);
                if (src < dest)
                        inode_unlock_shared(src);
                error = break_layout(dest, true);
                if (error)
                        return error;
                goto retry;
        }
        if (error) {
                inode_unlock(dest);
                if (src < dest)
                        inode_unlock_shared(src);
                return error;
        }
        if (src > dest)
                inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
        return 0;
}
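
/*
 * Editor's example of the ordering rule above (hypothetical pointer
 * values, not from the original source): with src = 0x1000 and
 * dest = 0x2000, src is locked shared first and dest exclusively second;
 * with the addresses reversed, dest is locked first and src is taken
 * shared only after dest's layout has been broken; and if src == dest,
 * only the single exclusive lock on dest is taken.
 */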

/* Unlock both inodes after they've been prepped for a range clone. */
void
xfs_reflink_remap_unlock(
        struct file             *file_in,
        struct file             *file_out)
{
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        bool                    same_inode = (inode_in == inode_out);

        xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
        if (!same_inode)
                xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
        inode_unlock(inode_out);
        if (!same_inode)
                inode_unlock_shared(inode_in);
}

/*
 * If we're reflinking to a point past the destination file's EOF, we must
 * zero any speculative post-EOF preallocations that sit between the old EOF
 * and the destination file offset.
 */
static int
xfs_reflink_zero_posteof(
        struct xfs_inode        *ip,
        loff_t                  pos)
{
        loff_t                  isize = i_size_read(VFS_I(ip));

        if (pos <= isize)
                return 0;

        trace_xfs_zero_eof(ip, isize, pos - isize);
        return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
                        &xfs_iomap_ops);
}

/*
 * Prepare two files for range cloning.  Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken.  This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it
 * hasn't checked that the bytes beyond EOF physically match.  Hence we cannot
 * use the EOF block in the source dedupe range because it's not a complete
 * block match, and so can introduce a corruption into the file that has its
 * block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to
 * be "block aligned" for the purposes of cloning entire files.  However, if
 * the source file range includes the EOF block and it lands within the
 * existing EOF of the destination file, then we can expose stale data from
 * beyond the source file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have to
 * check for these conditions ourselves.  For dedupe, we can simply round the
 * length to dedupe down to the previous whole block and ignore the partial
 * EOF block.  While this means we can't dedupe the last block of a file, this
 * is an acceptable tradeoff for simplicity of implementation.
 *
 * For cloning, we want to share the partial EOF block if it is also the new
 * EOF block of the destination file.  If the partial EOF block lies inside
 * the existing destination EOF, then we have to abort the clone to avoid
 * exposing stale data in the destination file.  Hence we reject these clone
 * attempts with -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
        struct file             *file_in,
        loff_t                  pos_in,
        struct file             *file_out,
        loff_t                  pos_out,
        loff_t                  *len,
        unsigned int            remap_flags)
{
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        bool                    same_inode = (inode_in == inode_out);
        ssize_t                 ret;

        /* Lock both files against IO */
        ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
        if (ret)
                return ret;
        if (same_inode)
                xfs_ilock(src, XFS_MMAPLOCK_EXCL);
        else
                xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
                                XFS_MMAPLOCK_EXCL);

        /* Check file eligibility and prepare for block sharing. */
        ret = -EINVAL;
        /* Don't reflink realtime inodes */
        if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
                goto out_unlock;

        /* Don't share DAX file data for now. */
        if (IS_DAX(inode_in) || IS_DAX(inode_out))
                goto out_unlock;

        ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
                        len, remap_flags);
        if (ret < 0 || *len == 0)
                goto out_unlock;

        /* Attach dquots to dest inode before changing block map */
        ret = xfs_qm_dqattach(dest);
        if (ret)
                goto out_unlock;

        /*
         * Zero existing post-eof speculative preallocations in the destination
         * file.
         */
        ret = xfs_reflink_zero_posteof(dest, pos_out);
        if (ret)
                goto out_unlock;

        /* Set flags and remap blocks. */
        ret = xfs_reflink_set_inode_flag(src, dest);
        if (ret)
                goto out_unlock;

        /*
         * If pos_out > EOF, we may have dirtied blocks between EOF and
         * pos_out.  In that case, we need to extend the flush and unmap to
         * cover from EOF to the end of the copy length.
         */
        if (pos_out > XFS_ISIZE(dest)) {
                loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
                ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
        } else {
                ret = xfs_flush_unmap_range(dest, pos_out, *len);
        }
        if (ret)
                goto out_unlock;

        return 1;
out_unlock:
        xfs_reflink_remap_unlock(file_in, file_out);
        return ret;
}
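
/*
 * Editor's sketch of the expected caller pattern (an assumption, not
 * from the original source): a remap_file_range implementation would
 * treat the three outcomes of xfs_reflink_remap_prep() like this:
 *
 *      ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
 *                      &len, remap_flags);
 *      if (ret <= 0)
 *              return ret;     // error, or nothing to do (locks dropped)
 *      // ret == 1: both inodes locked, page cache flushed; remap now,
 *      // then call xfs_reflink_remap_unlock(file_in, file_out).
 */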

/*
 * The user wants to preemptively CoW all shared blocks in this file,
 * which enables us to turn off the reflink flag.  Iterate all
 * extents which are not prealloc/delalloc to see which ranges are
 * mentioned in the refcount tree, then read those blocks into the
 * pagecache, dirty them, fsync them back out, and then we can update
 * the inode flag.  What happens if we run out of memory? :)
 */
STATIC int
xfs_reflink_dirty_extents(
        struct xfs_inode        *ip,
        xfs_fileoff_t           fbno,
        xfs_filblks_t           end,
        xfs_off_t               isize)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        xfs_extlen_t            aglen;
        xfs_agblock_t           rbno;
        xfs_extlen_t            rlen;
        xfs_off_t               fpos;
        xfs_off_t               flen;
        struct xfs_bmbt_irec    map[2];
        int                     nmaps;
        int                     error = 0;

        while (end - fbno > 0) {
                nmaps = 1;
                /*
                 * Look for extents in the file.  Skip holes, delalloc, or
                 * unwritten extents; they can't be reflinked.
                 */
                error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
                if (error)
                        goto out;
                if (nmaps == 0)
                        break;
                if (!xfs_bmap_is_real_extent(&map[0]))
                        goto next;

                map[1] = map[0];
                while (map[1].br_blockcount) {
                        agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
                        agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
                        aglen = map[1].br_blockcount;

                        error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
                                        aglen, &rbno, &rlen, true);
                        if (error)
                                goto out;
                        if (rbno == NULLAGBLOCK)
                                break;

                        /* Dirty the pages */
                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                        fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
                                        (rbno - agbno));
                        flen = XFS_FSB_TO_B(mp, rlen);
                        if (fpos + flen > isize)
                                flen = isize - fpos;
                        error = iomap_file_dirty(VFS_I(ip), fpos, flen,
                                        &xfs_iomap_ops);
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        if (error)
                                goto out;

                        map[1].br_blockcount -= (rbno - agbno + rlen);
                        map[1].br_startoff += (rbno - agbno + rlen);
                        map[1].br_startblock += (rbno - agbno + rlen);
                }

next:
                fbno = map[0].br_startoff + map[0].br_blockcount;
        }
out:
        return error;
}
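
/*
 * Editor's worked example for the cursor advance above (hypothetical
 * numbers, not from the original source): if the mapping under scan
 * starts at agbno 100 and xfs_reflink_find_shared() reports a shared run
 * at rbno = 104 with rlen = 8, then rbno - agbno + rlen = 12, so the
 * loop skips the 4 unshared leading blocks plus the 8 just-dirtied
 * shared ones and resumes scanning at agbno 112.
 */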

/* Does this inode need the reflink flag? */
int
xfs_reflink_inode_has_shared_extents(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
        bool                    *has_shared)
{
        struct xfs_bmbt_irec    got;
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_ifork        *ifp;
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        xfs_extlen_t            aglen;
        xfs_agblock_t           rbno;
        xfs_extlen_t            rlen;
        struct xfs_iext_cursor  icur;
        bool                    found;
        int                     error;

        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
                if (error)
                        return error;
        }

        *has_shared = false;
        found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
        while (found) {
                if (isnullstartblock(got.br_startblock) ||
                    got.br_state != XFS_EXT_NORM)
                        goto next;
                agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
                agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
                aglen = got.br_blockcount;

                error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
                                &rbno, &rlen, false);
                if (error)
                        return error;
                /* Is there still a shared block here? */
                if (rbno != NULLAGBLOCK) {
                        *has_shared = true;
                        return 0;
                }
next:
                found = xfs_iext_next_extent(ifp, &icur, &got);
        }

        return 0;
}

/*
 * Clear the inode reflink flag if there are no shared extents.
 *
 * The caller is responsible for joining the inode to the transaction passed
 * in.  The inode will be joined to the transaction that is returned to the
 * caller.
 */
int
xfs_reflink_clear_inode_flag(
        struct xfs_inode        *ip,
        struct xfs_trans        **tpp)
{
        bool                    needs_flag;
        int                     error = 0;

        ASSERT(xfs_is_reflink_inode(ip));

        error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
        if (error || needs_flag)
                return error;

        /*
         * We didn't find any shared blocks so turn off the reflink flag.
         * First, get rid of any leftover CoW mappings.
         */
        error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
        if (error)
                return error;

        /* Clear the inode flag. */
        trace_xfs_reflink_unset_inode_flag(ip);
        ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
        xfs_inode_clear_cowblocks_tag(ip);
        xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

        return error;
}

/*
 * Clear the inode reflink flag if there are no shared extents and the size
 * hasn't changed.
 */
STATIC int
xfs_reflink_try_clear_inode_flag(
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        int                     error = 0;

        /* Start a rolling transaction to remove the mappings */
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);

        error = xfs_reflink_clear_inode_flag(ip, &tp);
        if (error)
                goto cancel;

        error = xfs_trans_commit(tp);
        if (error)
                goto out;

        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return 0;
cancel:
        xfs_trans_cancel(tp);
out:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}

/*
 * Pre-COW all shared blocks within a given byte range of a file and turn off
 * the reflink flag if we unshare all of the file's blocks.
 */
int
xfs_reflink_unshare(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        xfs_off_t               len)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           fbno;
        xfs_filblks_t           end;
        xfs_off_t               isize;
        int                     error;

        if (!xfs_is_reflink_inode(ip))
                return 0;

        trace_xfs_reflink_unshare(ip, offset, len);

        inode_dio_wait(VFS_I(ip));

        /* Try to CoW the selected ranges */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        fbno = XFS_B_TO_FSBT(mp, offset);
        isize = i_size_read(VFS_I(ip));
        end = XFS_B_TO_FSB(mp, offset + len);
        error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
        if (error)
                goto out_unlock;
        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        /* Wait for the IO to finish */
        error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
        if (error)
                goto out;

        /* Turn off the reflink flag if possible. */
        error = xfs_reflink_try_clear_inode_flag(ip);
        if (error)
                goto out;

        return 0;

out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
        trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
        return error;
}