1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_shared.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_quota.h" 16 #include "xfs_bmap_util.h" 17 #include "xfs_reflink.h" 18 #include "xfs_trace.h" 19 #include "xfs_exchrange.h" 20 #include "xfs_exchmaps.h" 21 #include "xfs_sb.h" 22 #include "xfs_icache.h" 23 #include "xfs_log.h" 24 #include "xfs_rtbitmap.h" 25 #include <linux/fsnotify.h> 26 27 /* Lock (and optionally join) two inodes for a file range exchange. */ 28 void 29 xfs_exchrange_ilock( 30 struct xfs_trans *tp, 31 struct xfs_inode *ip1, 32 struct xfs_inode *ip2) 33 { 34 if (ip1 != ip2) 35 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, 36 ip2, XFS_ILOCK_EXCL); 37 else 38 xfs_ilock(ip1, XFS_ILOCK_EXCL); 39 if (tp) { 40 xfs_trans_ijoin(tp, ip1, 0); 41 if (ip2 != ip1) 42 xfs_trans_ijoin(tp, ip2, 0); 43 } 44 45 } 46 47 /* Unlock two inodes after a file range exchange operation. */ 48 void 49 xfs_exchrange_iunlock( 50 struct xfs_inode *ip1, 51 struct xfs_inode *ip2) 52 { 53 if (ip2 != ip1) 54 xfs_iunlock(ip2, XFS_ILOCK_EXCL); 55 xfs_iunlock(ip1, XFS_ILOCK_EXCL); 56 } 57 58 /* 59 * Estimate the resource requirements to exchange file contents between the two 60 * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to 61 * have flushed both inodes' pagecache and active direct-ios. 62 */ 63 int 64 xfs_exchrange_estimate( 65 struct xfs_exchmaps_req *req) 66 { 67 int error; 68 69 xfs_exchrange_ilock(NULL, req->ip1, req->ip2); 70 error = xfs_exchmaps_estimate(req); 71 xfs_exchrange_iunlock(req->ip1, req->ip2); 72 return error; 73 } 74 75 /* 76 * Check that file2's metadata agree with the snapshot that we took for the 77 * range commit request. 78 * 79 * This should be called after the filesystem has locked /all/ inode metadata 80 * against modification. 81 */ 82 STATIC int 83 xfs_exchrange_check_freshness( 84 const struct xfs_exchrange *fxr, 85 struct xfs_inode *ip2) 86 { 87 struct inode *inode2 = VFS_I(ip2); 88 struct timespec64 ctime = inode_get_ctime(inode2); 89 struct timespec64 mtime = inode_get_mtime(inode2); 90 91 trace_xfs_exchrange_freshness(fxr, ip2); 92 93 /* Check that file2 hasn't otherwise been modified. */ 94 if (fxr->file2_ino != ip2->i_ino || 95 fxr->file2_gen != inode2->i_generation || 96 !timespec64_equal(&fxr->file2_ctime, &ctime) || 97 !timespec64_equal(&fxr->file2_mtime, &mtime)) 98 return -EBUSY; 99 100 return 0; 101 } 102 103 #define QRETRY_IP1 (0x1) 104 #define QRETRY_IP2 (0x2) 105 106 /* 107 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip 108 * this if quota enforcement is disabled or if both inodes' dquots are the 109 * same. The qretry structure must be initialized to zeroes before the first 110 * call to this function. 111 */ 112 STATIC int 113 xfs_exchrange_reserve_quota( 114 struct xfs_trans *tp, 115 const struct xfs_exchmaps_req *req, 116 unsigned int *qretry) 117 { 118 int64_t ddelta, rdelta; 119 int ip1_error = 0; 120 int error; 121 122 ASSERT(!xfs_is_metadir_inode(req->ip1)); 123 ASSERT(!xfs_is_metadir_inode(req->ip2)); 124 125 /* 126 * Don't bother with a quota reservation if we're not enforcing them 127 * or the two inodes have the same dquots. 128 */ 129 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || 130 (req->ip1->i_udquot == req->ip2->i_udquot && 131 req->ip1->i_gdquot == req->ip2->i_gdquot && 132 req->ip1->i_pdquot == req->ip2->i_pdquot)) 133 return 0; 134 135 *qretry = 0; 136 137 /* 138 * For each file, compute the net gain in the number of regular blocks 139 * that will be mapped into that file and reserve that much quota. The 140 * quota counts must be able to absorb at least that much space. 141 */ 142 ddelta = req->ip2_bcount - req->ip1_bcount; 143 rdelta = req->ip2_rtbcount - req->ip1_rtbcount; 144 if (ddelta > 0 || rdelta > 0) { 145 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, 146 ddelta > 0 ? ddelta : 0, 147 rdelta > 0 ? rdelta : 0, 148 false); 149 if (error == -EDQUOT || error == -ENOSPC) { 150 /* 151 * Save this error and see what happens if we try to 152 * reserve quota for ip2. Then report both. 153 */ 154 *qretry |= QRETRY_IP1; 155 ip1_error = error; 156 error = 0; 157 } 158 if (error) 159 return error; 160 } 161 if (ddelta < 0 || rdelta < 0) { 162 error = xfs_trans_reserve_quota_nblks(tp, req->ip2, 163 ddelta < 0 ? -ddelta : 0, 164 rdelta < 0 ? -rdelta : 0, 165 false); 166 if (error == -EDQUOT || error == -ENOSPC) 167 *qretry |= QRETRY_IP2; 168 if (error) 169 return error; 170 } 171 if (ip1_error) 172 return ip1_error; 173 174 /* 175 * For each file, forcibly reserve the gross gain in mapped blocks so 176 * that we don't trip over any quota block reservation assertions. 177 * We must reserve the gross gain because the quota code subtracts from 178 * bcount the number of blocks that we unmap; it does not add that 179 * quantity back to the quota block reservation. 180 */ 181 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount, 182 req->ip1_rtbcount, true); 183 if (error) 184 return error; 185 186 return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount, 187 req->ip2_rtbcount, true); 188 } 189 190 /* Exchange the mappings (and hence the contents) of two files' forks. */ 191 STATIC int 192 xfs_exchrange_mappings( 193 const struct xfs_exchrange *fxr, 194 struct xfs_inode *ip1, 195 struct xfs_inode *ip2) 196 { 197 struct xfs_mount *mp = ip1->i_mount; 198 struct xfs_exchmaps_req req = { 199 .ip1 = ip1, 200 .ip2 = ip2, 201 .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset), 202 .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset), 203 .blockcount = XFS_B_TO_FSB(mp, fxr->length), 204 }; 205 struct xfs_trans *tp; 206 unsigned int qretry; 207 bool retried = false; 208 int error; 209 210 trace_xfs_exchrange_mappings(fxr, ip1, ip2); 211 212 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) 213 req.flags |= XFS_EXCHMAPS_SET_SIZES; 214 if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN) 215 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN; 216 217 /* 218 * Round the request length up to the nearest file allocation unit. 219 * The prep function already checked that the request offsets and 220 * length in @fxr are safe to round up. 221 */ 222 if (xfs_inode_has_bigrtalloc(ip2)) 223 req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount); 224 225 error = xfs_exchrange_estimate(&req); 226 if (error) 227 return error; 228 229 retry: 230 /* Allocate the transaction, lock the inodes, and join them. */ 231 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0, 232 XFS_TRANS_RES_FDBLKS, &tp); 233 if (error) 234 return error; 235 236 xfs_exchrange_ilock(tp, ip1, ip2); 237 238 trace_xfs_exchrange_before(ip2, 2); 239 trace_xfs_exchrange_before(ip1, 1); 240 241 error = xfs_exchmaps_check_forks(mp, &req); 242 if (error) 243 goto out_trans_cancel; 244 245 /* 246 * Reserve ourselves some quota if any of them are in enforcing mode. 247 * In theory we only need enough to satisfy the change in the number 248 * of blocks between the two ranges being remapped. 249 */ 250 error = xfs_exchrange_reserve_quota(tp, &req, &qretry); 251 if ((error == -EDQUOT || error == -ENOSPC) && !retried) { 252 xfs_trans_cancel(tp); 253 xfs_exchrange_iunlock(ip1, ip2); 254 if (qretry & QRETRY_IP1) 255 xfs_blockgc_free_quota(ip1, 0); 256 if (qretry & QRETRY_IP2) 257 xfs_blockgc_free_quota(ip2, 0); 258 retried = true; 259 goto retry; 260 } 261 if (error) 262 goto out_trans_cancel; 263 264 /* If we got this far on a dry run, all parameters are ok. */ 265 if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN) 266 goto out_trans_cancel; 267 268 /* Update the mtime and ctime of both files. */ 269 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1) 270 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 271 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2) 272 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 273 274 xfs_exchange_mappings(tp, &req); 275 276 /* 277 * Force the log to persist metadata updates if the caller or the 278 * administrator requires this. The generic prep function already 279 * flushed the relevant parts of the page cache. 280 */ 281 if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC)) 282 xfs_trans_set_sync(tp); 283 284 error = xfs_trans_commit(tp); 285 286 trace_xfs_exchrange_after(ip2, 2); 287 trace_xfs_exchrange_after(ip1, 1); 288 289 if (error) 290 goto out_unlock; 291 292 /* 293 * If the caller wanted us to exchange the contents of two complete 294 * files of unequal length, exchange the incore sizes now. This should 295 * be safe because we flushed both files' page caches, exchanged all 296 * the mappings, and updated the ondisk sizes. 297 */ 298 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { 299 loff_t temp; 300 301 temp = i_size_read(VFS_I(ip2)); 302 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1))); 303 i_size_write(VFS_I(ip1), temp); 304 } 305 306 out_unlock: 307 xfs_exchrange_iunlock(ip1, ip2); 308 return error; 309 310 out_trans_cancel: 311 xfs_trans_cancel(tp); 312 goto out_unlock; 313 } 314 315 /* 316 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. 317 * This part deals with struct file objects and byte ranges and does not deal 318 * with XFS-specific data structures such as xfs_inodes and block ranges. This 319 * separation may some day facilitate porting to another filesystem. 320 * 321 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in 322 * file1 with the same number of bytes starting at fxr.file2_offset in file2. 323 * Implementations must call xfs_exchange_range_prep to prepare the two 324 * files prior to taking locks; and they must update the inode change and mod 325 * times of both files as part of the metadata update. The timestamp update 326 * and freshness checks must be done atomically as part of the data exchange 327 * operation to ensure correctness of the freshness check. 328 * xfs_exchange_range_finish must be called after the operation completes 329 * successfully but before locks are dropped. 330 */ 331 332 /* Verify that we have security clearance to perform this operation. */ 333 static int 334 xfs_exchange_range_verify_area( 335 struct xfs_exchrange *fxr) 336 { 337 int ret; 338 339 ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, 340 true); 341 if (ret) 342 return ret; 343 344 return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, 345 true); 346 } 347 348 /* 349 * Performs necessary checks before doing a range exchange, having stabilized 350 * mutable inode attributes via i_rwsem. 351 */ 352 static inline int 353 xfs_exchange_range_checks( 354 struct xfs_exchrange *fxr, 355 unsigned int alloc_unit) 356 { 357 struct inode *inode1 = file_inode(fxr->file1); 358 struct inode *inode2 = file_inode(fxr->file2); 359 uint64_t allocmask = alloc_unit - 1; 360 int64_t test_len; 361 uint64_t blen; 362 loff_t size1, size2, tmp; 363 int error; 364 365 /* Don't touch certain kinds of inodes */ 366 if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) 367 return -EPERM; 368 if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) 369 return -ETXTBSY; 370 371 size1 = i_size_read(inode1); 372 size2 = i_size_read(inode2); 373 374 /* Ranges cannot start after EOF. */ 375 if (fxr->file1_offset > size1 || fxr->file2_offset > size2) 376 return -EINVAL; 377 378 /* 379 * If the caller said to exchange to EOF, we set the length of the 380 * request large enough to cover everything to the end of both files. 381 */ 382 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { 383 fxr->length = max_t(int64_t, size1 - fxr->file1_offset, 384 size2 - fxr->file2_offset); 385 386 error = xfs_exchange_range_verify_area(fxr); 387 if (error) 388 return error; 389 } 390 391 /* 392 * The start of both ranges must be aligned to the file allocation 393 * unit. 394 */ 395 if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) || 396 !IS_ALIGNED(fxr->file2_offset, alloc_unit)) 397 return -EINVAL; 398 399 /* Ensure offsets don't wrap. */ 400 if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) || 401 check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) 402 return -EINVAL; 403 404 /* 405 * We require both ranges to end within EOF, unless we're exchanging 406 * to EOF. 407 */ 408 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && 409 (fxr->file1_offset + fxr->length > size1 || 410 fxr->file2_offset + fxr->length > size2)) 411 return -EINVAL; 412 413 /* 414 * Make sure we don't hit any file size limits. If we hit any size 415 * limits such that test_length was adjusted, we abort the whole 416 * operation. 417 */ 418 test_len = fxr->length; 419 error = generic_write_check_limits(fxr->file2, fxr->file2_offset, 420 &test_len); 421 if (error) 422 return error; 423 error = generic_write_check_limits(fxr->file1, fxr->file1_offset, 424 &test_len); 425 if (error) 426 return error; 427 if (test_len != fxr->length) 428 return -EINVAL; 429 430 /* 431 * If the user wanted us to exchange up to the infile's EOF, round up 432 * to the next allocation unit boundary for this check. Do the same 433 * for the outfile. 434 * 435 * Otherwise, reject the range length if it's not aligned to an 436 * allocation unit. 437 */ 438 if (fxr->file1_offset + fxr->length == size1) 439 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset; 440 else if (fxr->file2_offset + fxr->length == size2) 441 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset; 442 else if (!IS_ALIGNED(fxr->length, alloc_unit)) 443 return -EINVAL; 444 else 445 blen = fxr->length; 446 447 /* Don't allow overlapped exchanges within the same file. */ 448 if (inode1 == inode2 && 449 fxr->file2_offset + blen > fxr->file1_offset && 450 fxr->file1_offset + blen > fxr->file2_offset) 451 return -EINVAL; 452 453 /* 454 * Ensure that we don't exchange a partial EOF block into the middle of 455 * another file. 456 */ 457 if ((fxr->length & allocmask) == 0) 458 return 0; 459 460 blen = fxr->length; 461 if (fxr->file2_offset + blen < size2) 462 blen &= ~allocmask; 463 464 if (fxr->file1_offset + blen < size1) 465 blen &= ~allocmask; 466 467 return blen == fxr->length ? 0 : -EINVAL; 468 } 469 470 /* 471 * Check that the two inodes are eligible for range exchanges, the ranges make 472 * sense, and then flush all dirty data. Caller must ensure that the inodes 473 * have been locked against any other modifications. 474 */ 475 static inline int 476 xfs_exchange_range_prep( 477 struct xfs_exchrange *fxr, 478 unsigned int alloc_unit) 479 { 480 struct inode *inode1 = file_inode(fxr->file1); 481 struct inode *inode2 = file_inode(fxr->file2); 482 bool same_inode = (inode1 == inode2); 483 int error; 484 485 /* Check that we don't violate system file offset limits. */ 486 error = xfs_exchange_range_checks(fxr, alloc_unit); 487 if (error || fxr->length == 0) 488 return error; 489 490 /* Wait for the completion of any pending IOs on both files */ 491 inode_dio_wait(inode1); 492 if (!same_inode) 493 inode_dio_wait(inode2); 494 495 error = filemap_write_and_wait_range(inode1->i_mapping, 496 fxr->file1_offset, 497 fxr->file1_offset + fxr->length - 1); 498 if (error) 499 return error; 500 501 error = filemap_write_and_wait_range(inode2->i_mapping, 502 fxr->file2_offset, 503 fxr->file2_offset + fxr->length - 1); 504 if (error) 505 return error; 506 507 /* 508 * If the files or inodes involved require synchronous writes, amend 509 * the request to force the filesystem to flush all data and metadata 510 * to disk after the operation completes. 511 */ 512 if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) || 513 IS_SYNC(inode1) || IS_SYNC(inode2)) 514 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC; 515 516 return 0; 517 } 518 519 /* 520 * Finish a range exchange operation, if it was successful. Caller must ensure 521 * that the inodes are still locked against any other modifications. 522 */ 523 static inline int 524 xfs_exchange_range_finish( 525 struct xfs_exchrange *fxr) 526 { 527 int error; 528 529 error = file_remove_privs(fxr->file1); 530 if (error) 531 return error; 532 if (file_inode(fxr->file1) == file_inode(fxr->file2)) 533 return 0; 534 535 return file_remove_privs(fxr->file2); 536 } 537 538 /* 539 * Check the alignment of an exchange request when the allocation unit size 540 * isn't a power of two. The generic file-level helpers use (fast) 541 * bitmask-based alignment checks, but here we have to use slow long division. 542 */ 543 static int 544 xfs_exchrange_check_rtalign( 545 const struct xfs_exchrange *fxr, 546 struct xfs_inode *ip1, 547 struct xfs_inode *ip2, 548 unsigned int alloc_unit) 549 { 550 uint64_t length = fxr->length; 551 uint64_t blen; 552 loff_t size1, size2; 553 554 size1 = i_size_read(VFS_I(ip1)); 555 size2 = i_size_read(VFS_I(ip2)); 556 557 /* The start of both ranges must be aligned to a rt extent. */ 558 if (!isaligned_64(fxr->file1_offset, alloc_unit) || 559 !isaligned_64(fxr->file2_offset, alloc_unit)) 560 return -EINVAL; 561 562 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) 563 length = max_t(int64_t, size1 - fxr->file1_offset, 564 size2 - fxr->file2_offset); 565 566 /* 567 * If the user wanted us to exchange up to the infile's EOF, round up 568 * to the next rt extent boundary for this check. Do the same for the 569 * outfile. 570 * 571 * Otherwise, reject the range length if it's not rt extent aligned. 572 * We already confirmed the starting offsets' rt extent block 573 * alignment. 574 */ 575 if (fxr->file1_offset + length == size1) 576 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset; 577 else if (fxr->file2_offset + length == size2) 578 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset; 579 else if (!isaligned_64(length, alloc_unit)) 580 return -EINVAL; 581 else 582 blen = length; 583 584 /* Don't allow overlapped exchanges within the same file. */ 585 if (ip1 == ip2 && 586 fxr->file2_offset + blen > fxr->file1_offset && 587 fxr->file1_offset + blen > fxr->file2_offset) 588 return -EINVAL; 589 590 /* 591 * Ensure that we don't exchange a partial EOF rt extent into the 592 * middle of another file. 593 */ 594 if (isaligned_64(length, alloc_unit)) 595 return 0; 596 597 blen = length; 598 if (fxr->file2_offset + length < size2) 599 blen = rounddown_64(blen, alloc_unit); 600 601 if (fxr->file1_offset + blen < size1) 602 blen = rounddown_64(blen, alloc_unit); 603 604 return blen == length ? 0 : -EINVAL; 605 } 606 607 /* Prepare two files to have their data exchanged. */ 608 STATIC int 609 xfs_exchrange_prep( 610 struct xfs_exchrange *fxr, 611 struct xfs_inode *ip1, 612 struct xfs_inode *ip2) 613 { 614 struct xfs_mount *mp = ip2->i_mount; 615 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2); 616 int error; 617 618 trace_xfs_exchrange_prep(fxr, ip1, ip2); 619 620 /* Verify both files are either real-time or non-realtime */ 621 if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) 622 return -EINVAL; 623 624 /* Check non-power of two alignment issues, if necessary. */ 625 if (!is_power_of_2(alloc_unit)) { 626 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit); 627 if (error) 628 return error; 629 630 /* 631 * Do the generic file-level checks with the regular block 632 * alignment. 633 */ 634 alloc_unit = mp->m_sb.sb_blocksize; 635 } 636 637 error = xfs_exchange_range_prep(fxr, alloc_unit); 638 if (error || fxr->length == 0) 639 return error; 640 641 if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { 642 error = xfs_exchrange_check_freshness(fxr, ip2); 643 if (error) 644 return error; 645 } 646 647 /* Attach dquots to both inodes before changing block maps. */ 648 error = xfs_qm_dqattach(ip2); 649 if (error) 650 return error; 651 error = xfs_qm_dqattach(ip1); 652 if (error) 653 return error; 654 655 trace_xfs_exchrange_flush(fxr, ip1, ip2); 656 657 /* Flush the relevant ranges of both files. */ 658 error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); 659 if (error) 660 return error; 661 error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); 662 if (error) 663 return error; 664 665 /* 666 * Cancel CoW fork preallocations for the ranges of both files. The 667 * prep function should have flushed all the dirty data, so the only 668 * CoW mappings remaining should be speculative. 669 */ 670 if (xfs_inode_has_cow_data(ip1)) { 671 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset, 672 fxr->length, true); 673 if (error) 674 return error; 675 } 676 677 if (xfs_inode_has_cow_data(ip2)) { 678 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset, 679 fxr->length, true); 680 if (error) 681 return error; 682 } 683 684 return 0; 685 } 686 687 /* 688 * Exchange contents of files. This is the binding between the generic 689 * file-level concepts and the XFS inode-specific implementation. 690 */ 691 STATIC int 692 xfs_exchrange_contents( 693 struct xfs_exchrange *fxr) 694 { 695 struct inode *inode1 = file_inode(fxr->file1); 696 struct inode *inode2 = file_inode(fxr->file2); 697 struct xfs_inode *ip1 = XFS_I(inode1); 698 struct xfs_inode *ip2 = XFS_I(inode2); 699 struct xfs_mount *mp = ip1->i_mount; 700 int error; 701 702 if (!xfs_has_exchange_range(mp)) 703 return -EOPNOTSUPP; 704 705 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | 706 XFS_EXCHANGE_RANGE_PRIV_FLAGS)) 707 return -EINVAL; 708 709 if (xfs_is_shutdown(mp)) 710 return -EIO; 711 712 /* Lock both files against IO */ 713 error = xfs_ilock2_io_mmap(ip1, ip2); 714 if (error) 715 goto out_err; 716 717 /* Prepare and then exchange file contents. */ 718 error = xfs_exchrange_prep(fxr, ip1, ip2); 719 if (error) 720 goto out_unlock; 721 722 error = xfs_exchrange_mappings(fxr, ip1, ip2); 723 if (error) 724 goto out_unlock; 725 726 /* 727 * Finish the exchange by removing special file privileges like any 728 * other file write would do. This may involve turning on support for 729 * logged xattrs if either file has security capabilities. 730 */ 731 error = xfs_exchange_range_finish(fxr); 732 if (error) 733 goto out_unlock; 734 735 out_unlock: 736 xfs_iunlock2_io_mmap(ip1, ip2); 737 out_err: 738 if (error) 739 trace_xfs_exchrange_error(ip2, error, _RET_IP_); 740 return error; 741 } 742 743 /* Exchange parts of two files. */ 744 static int 745 xfs_exchange_range( 746 struct xfs_exchrange *fxr) 747 { 748 struct inode *inode1 = file_inode(fxr->file1); 749 struct inode *inode2 = file_inode(fxr->file2); 750 int ret; 751 752 BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & 753 XFS_EXCHANGE_RANGE_PRIV_FLAGS); 754 755 /* Both files must be on the same mount/filesystem. */ 756 if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) 757 return -EXDEV; 758 759 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | 760 __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) 761 return -EINVAL; 762 763 /* Userspace requests only honored for regular files. */ 764 if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) 765 return -EISDIR; 766 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) 767 return -EINVAL; 768 769 /* Both files must be opened for read and write. */ 770 if (!(fxr->file1->f_mode & FMODE_READ) || 771 !(fxr->file1->f_mode & FMODE_WRITE) || 772 !(fxr->file2->f_mode & FMODE_READ) || 773 !(fxr->file2->f_mode & FMODE_WRITE)) 774 return -EBADF; 775 776 /* Neither file can be opened append-only. */ 777 if ((fxr->file1->f_flags & O_APPEND) || 778 (fxr->file2->f_flags & O_APPEND)) 779 return -EBADF; 780 781 /* 782 * If we're not exchanging to EOF, we can check the areas before 783 * stabilizing both files' i_size. 784 */ 785 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { 786 ret = xfs_exchange_range_verify_area(fxr); 787 if (ret) 788 return ret; 789 } 790 791 /* Update cmtime if the fd/inode don't forbid it. */ 792 if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) 793 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1; 794 if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)) 795 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2; 796 797 file_start_write(fxr->file2); 798 ret = xfs_exchrange_contents(fxr); 799 file_end_write(fxr->file2); 800 if (ret) 801 return ret; 802 803 fsnotify_modify(fxr->file1); 804 if (fxr->file2 != fxr->file1) 805 fsnotify_modify(fxr->file2); 806 return 0; 807 } 808 809 /* Collect exchange-range arguments from userspace. */ 810 long 811 xfs_ioc_exchange_range( 812 struct file *file, 813 struct xfs_exchange_range __user *argp) 814 { 815 struct xfs_exchrange fxr = { 816 .file2 = file, 817 }; 818 struct xfs_exchange_range args; 819 820 if (copy_from_user(&args, argp, sizeof(args))) 821 return -EFAULT; 822 if (memchr_inv(&args.pad, 0, sizeof(args.pad))) 823 return -EINVAL; 824 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) 825 return -EINVAL; 826 827 fxr.file1_offset = args.file1_offset; 828 fxr.file2_offset = args.file2_offset; 829 fxr.length = args.length; 830 fxr.flags = args.flags; 831 832 CLASS(fd, file1)(args.file1_fd); 833 if (fd_empty(file1)) 834 return -EBADF; 835 fxr.file1 = fd_file(file1); 836 837 return xfs_exchange_range(&fxr); 838 } 839 840 /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ 841 struct xfs_commit_range_fresh { 842 xfs_fsid_t fsid; /* m_fixedfsid */ 843 __u64 file2_ino; /* inode number */ 844 __s64 file2_mtime; /* modification time */ 845 __s64 file2_ctime; /* change time */ 846 __s32 file2_mtime_nsec; /* mod time, nsec */ 847 __s32 file2_ctime_nsec; /* change time, nsec */ 848 __u32 file2_gen; /* inode generation */ 849 __u32 magic; /* zero */ 850 }; 851 #define XCR_FRESH_MAGIC 0x444F524B /* DORK */ 852 853 /* Set up a commitrange operation by sampling file2's write-related attrs */ 854 long 855 xfs_ioc_start_commit( 856 struct file *file, 857 struct xfs_commit_range __user *argp) 858 { 859 struct xfs_commit_range args = { }; 860 struct kstat kstat = { }; 861 struct xfs_commit_range_fresh *kern_f; 862 struct xfs_commit_range_fresh __user *user_f; 863 struct inode *inode2 = file_inode(file); 864 struct xfs_inode *ip2 = XFS_I(inode2); 865 const unsigned int lockflags = XFS_IOLOCK_SHARED | 866 XFS_MMAPLOCK_SHARED | 867 XFS_ILOCK_SHARED; 868 869 BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != 870 sizeof(args.file2_freshness)); 871 872 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; 873 874 memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); 875 876 xfs_ilock(ip2, lockflags); 877 /* Force writing of a distinct ctime if any writes happen. */ 878 fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); 879 kern_f->file2_ctime = kstat.ctime.tv_sec; 880 kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; 881 kern_f->file2_mtime = kstat.mtime.tv_sec; 882 kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; 883 kern_f->file2_ino = ip2->i_ino; 884 kern_f->file2_gen = inode2->i_generation; 885 kern_f->magic = XCR_FRESH_MAGIC; 886 xfs_iunlock(ip2, lockflags); 887 888 user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; 889 if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) 890 return -EFAULT; 891 892 return 0; 893 } 894 895 /* 896 * Exchange file1 and file2 contents if file2 has not been written since the 897 * start commit operation. 898 */ 899 long 900 xfs_ioc_commit_range( 901 struct file *file, 902 struct xfs_commit_range __user *argp) 903 { 904 struct xfs_exchrange fxr = { 905 .file2 = file, 906 }; 907 struct xfs_commit_range args; 908 struct xfs_commit_range_fresh *kern_f; 909 struct xfs_inode *ip2 = XFS_I(file_inode(file)); 910 struct xfs_mount *mp = ip2->i_mount; 911 912 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; 913 914 if (copy_from_user(&args, argp, sizeof(args))) 915 return -EFAULT; 916 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) 917 return -EINVAL; 918 if (kern_f->magic != XCR_FRESH_MAGIC) 919 return -EBUSY; 920 if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t))) 921 return -EBUSY; 922 923 fxr.file1_offset = args.file1_offset; 924 fxr.file2_offset = args.file2_offset; 925 fxr.length = args.length; 926 fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; 927 fxr.file2_ino = kern_f->file2_ino; 928 fxr.file2_gen = kern_f->file2_gen; 929 fxr.file2_mtime.tv_sec = kern_f->file2_mtime; 930 fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; 931 fxr.file2_ctime.tv_sec = kern_f->file2_ctime; 932 fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; 933 934 CLASS(fd, file1)(args.file1_fd); 935 if (fd_empty(file1)) 936 return -EBADF; 937 fxr.file1 = fd_file(file1); 938 939 return xfs_exchange_range(&fxr); 940 } 941