1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_shared.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_quota.h" 16 #include "xfs_bmap_util.h" 17 #include "xfs_reflink.h" 18 #include "xfs_trace.h" 19 #include "xfs_exchrange.h" 20 #include "xfs_exchmaps.h" 21 #include "xfs_sb.h" 22 #include "xfs_icache.h" 23 #include "xfs_log.h" 24 #include "xfs_rtbitmap.h" 25 #include <linux/fsnotify.h> 26 27 /* Lock (and optionally join) two inodes for a file range exchange. */ 28 void 29 xfs_exchrange_ilock( 30 struct xfs_trans *tp, 31 struct xfs_inode *ip1, 32 struct xfs_inode *ip2) 33 { 34 if (ip1 != ip2) 35 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, 36 ip2, XFS_ILOCK_EXCL); 37 else 38 xfs_ilock(ip1, XFS_ILOCK_EXCL); 39 if (tp) { 40 xfs_trans_ijoin(tp, ip1, 0); 41 if (ip2 != ip1) 42 xfs_trans_ijoin(tp, ip2, 0); 43 } 44 45 } 46 47 /* Unlock two inodes after a file range exchange operation. */ 48 void 49 xfs_exchrange_iunlock( 50 struct xfs_inode *ip1, 51 struct xfs_inode *ip2) 52 { 53 if (ip2 != ip1) 54 xfs_iunlock(ip2, XFS_ILOCK_EXCL); 55 xfs_iunlock(ip1, XFS_ILOCK_EXCL); 56 } 57 58 /* 59 * Estimate the resource requirements to exchange file contents between the two 60 * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to 61 * have flushed both inodes' pagecache and active direct-ios. 62 */ 63 int 64 xfs_exchrange_estimate( 65 struct xfs_exchmaps_req *req) 66 { 67 int error; 68 69 xfs_exchrange_ilock(NULL, req->ip1, req->ip2); 70 error = xfs_exchmaps_estimate(req); 71 xfs_exchrange_iunlock(req->ip1, req->ip2); 72 return error; 73 } 74 75 /* 76 * Check that file2's metadata agree with the snapshot that we took for the 77 * range commit request. 78 * 79 * This should be called after the filesystem has locked /all/ inode metadata 80 * against modification. 81 */ 82 STATIC int 83 xfs_exchrange_check_freshness( 84 const struct xfs_exchrange *fxr, 85 struct xfs_inode *ip2) 86 { 87 struct inode *inode2 = VFS_I(ip2); 88 struct timespec64 ctime = inode_get_ctime(inode2); 89 struct timespec64 mtime = inode_get_mtime(inode2); 90 91 trace_xfs_exchrange_freshness(fxr, ip2); 92 93 /* Check that file2 hasn't otherwise been modified. */ 94 if (fxr->file2_ino != ip2->i_ino || 95 fxr->file2_gen != inode2->i_generation || 96 !timespec64_equal(&fxr->file2_ctime, &ctime) || 97 !timespec64_equal(&fxr->file2_mtime, &mtime)) 98 return -EBUSY; 99 100 return 0; 101 } 102 103 #define QRETRY_IP1 (0x1) 104 #define QRETRY_IP2 (0x2) 105 106 /* 107 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip 108 * this if quota enforcement is disabled or if both inodes' dquots are the 109 * same. The qretry structure must be initialized to zeroes before the first 110 * call to this function. 111 */ 112 STATIC int 113 xfs_exchrange_reserve_quota( 114 struct xfs_trans *tp, 115 const struct xfs_exchmaps_req *req, 116 unsigned int *qretry) 117 { 118 int64_t ddelta, rdelta; 119 int ip1_error = 0; 120 int error; 121 122 /* 123 * Don't bother with a quota reservation if we're not enforcing them 124 * or the two inodes have the same dquots. 125 */ 126 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || 127 (req->ip1->i_udquot == req->ip2->i_udquot && 128 req->ip1->i_gdquot == req->ip2->i_gdquot && 129 req->ip1->i_pdquot == req->ip2->i_pdquot)) 130 return 0; 131 132 *qretry = 0; 133 134 /* 135 * For each file, compute the net gain in the number of regular blocks 136 * that will be mapped into that file and reserve that much quota. The 137 * quota counts must be able to absorb at least that much space. 138 */ 139 ddelta = req->ip2_bcount - req->ip1_bcount; 140 rdelta = req->ip2_rtbcount - req->ip1_rtbcount; 141 if (ddelta > 0 || rdelta > 0) { 142 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, 143 ddelta > 0 ? ddelta : 0, 144 rdelta > 0 ? rdelta : 0, 145 false); 146 if (error == -EDQUOT || error == -ENOSPC) { 147 /* 148 * Save this error and see what happens if we try to 149 * reserve quota for ip2. Then report both. 150 */ 151 *qretry |= QRETRY_IP1; 152 ip1_error = error; 153 error = 0; 154 } 155 if (error) 156 return error; 157 } 158 if (ddelta < 0 || rdelta < 0) { 159 error = xfs_trans_reserve_quota_nblks(tp, req->ip2, 160 ddelta < 0 ? -ddelta : 0, 161 rdelta < 0 ? -rdelta : 0, 162 false); 163 if (error == -EDQUOT || error == -ENOSPC) 164 *qretry |= QRETRY_IP2; 165 if (error) 166 return error; 167 } 168 if (ip1_error) 169 return ip1_error; 170 171 /* 172 * For each file, forcibly reserve the gross gain in mapped blocks so 173 * that we don't trip over any quota block reservation assertions. 174 * We must reserve the gross gain because the quota code subtracts from 175 * bcount the number of blocks that we unmap; it does not add that 176 * quantity back to the quota block reservation. 177 */ 178 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount, 179 req->ip1_rtbcount, true); 180 if (error) 181 return error; 182 183 return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount, 184 req->ip2_rtbcount, true); 185 } 186 187 /* Exchange the mappings (and hence the contents) of two files' forks. */ 188 STATIC int 189 xfs_exchrange_mappings( 190 const struct xfs_exchrange *fxr, 191 struct xfs_inode *ip1, 192 struct xfs_inode *ip2) 193 { 194 struct xfs_mount *mp = ip1->i_mount; 195 struct xfs_exchmaps_req req = { 196 .ip1 = ip1, 197 .ip2 = ip2, 198 .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset), 199 .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset), 200 .blockcount = XFS_B_TO_FSB(mp, fxr->length), 201 }; 202 struct xfs_trans *tp; 203 unsigned int qretry; 204 bool retried = false; 205 int error; 206 207 trace_xfs_exchrange_mappings(fxr, ip1, ip2); 208 209 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) 210 req.flags |= XFS_EXCHMAPS_SET_SIZES; 211 if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN) 212 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN; 213 214 /* 215 * Round the request length up to the nearest file allocation unit. 216 * The prep function already checked that the request offsets and 217 * length in @fxr are safe to round up. 218 */ 219 if (xfs_inode_has_bigrtalloc(ip2)) 220 req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount); 221 222 error = xfs_exchrange_estimate(&req); 223 if (error) 224 return error; 225 226 retry: 227 /* Allocate the transaction, lock the inodes, and join them. */ 228 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0, 229 XFS_TRANS_RES_FDBLKS, &tp); 230 if (error) 231 return error; 232 233 xfs_exchrange_ilock(tp, ip1, ip2); 234 235 trace_xfs_exchrange_before(ip2, 2); 236 trace_xfs_exchrange_before(ip1, 1); 237 238 error = xfs_exchmaps_check_forks(mp, &req); 239 if (error) 240 goto out_trans_cancel; 241 242 /* 243 * Reserve ourselves some quota if any of them are in enforcing mode. 244 * In theory we only need enough to satisfy the change in the number 245 * of blocks between the two ranges being remapped. 246 */ 247 error = xfs_exchrange_reserve_quota(tp, &req, &qretry); 248 if ((error == -EDQUOT || error == -ENOSPC) && !retried) { 249 xfs_trans_cancel(tp); 250 xfs_exchrange_iunlock(ip1, ip2); 251 if (qretry & QRETRY_IP1) 252 xfs_blockgc_free_quota(ip1, 0); 253 if (qretry & QRETRY_IP2) 254 xfs_blockgc_free_quota(ip2, 0); 255 retried = true; 256 goto retry; 257 } 258 if (error) 259 goto out_trans_cancel; 260 261 /* If we got this far on a dry run, all parameters are ok. */ 262 if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN) 263 goto out_trans_cancel; 264 265 /* Update the mtime and ctime of both files. */ 266 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1) 267 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 268 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2) 269 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 270 271 xfs_exchange_mappings(tp, &req); 272 273 /* 274 * Force the log to persist metadata updates if the caller or the 275 * administrator requires this. The generic prep function already 276 * flushed the relevant parts of the page cache. 277 */ 278 if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC)) 279 xfs_trans_set_sync(tp); 280 281 error = xfs_trans_commit(tp); 282 283 trace_xfs_exchrange_after(ip2, 2); 284 trace_xfs_exchrange_after(ip1, 1); 285 286 if (error) 287 goto out_unlock; 288 289 /* 290 * If the caller wanted us to exchange the contents of two complete 291 * files of unequal length, exchange the incore sizes now. This should 292 * be safe because we flushed both files' page caches, exchanged all 293 * the mappings, and updated the ondisk sizes. 294 */ 295 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { 296 loff_t temp; 297 298 temp = i_size_read(VFS_I(ip2)); 299 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1))); 300 i_size_write(VFS_I(ip1), temp); 301 } 302 303 out_unlock: 304 xfs_exchrange_iunlock(ip1, ip2); 305 return error; 306 307 out_trans_cancel: 308 xfs_trans_cancel(tp); 309 goto out_unlock; 310 } 311 312 /* 313 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. 314 * This part deals with struct file objects and byte ranges and does not deal 315 * with XFS-specific data structures such as xfs_inodes and block ranges. This 316 * separation may some day facilitate porting to another filesystem. 317 * 318 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in 319 * file1 with the same number of bytes starting at fxr.file2_offset in file2. 320 * Implementations must call xfs_exchange_range_prep to prepare the two 321 * files prior to taking locks; and they must update the inode change and mod 322 * times of both files as part of the metadata update. The timestamp update 323 * and freshness checks must be done atomically as part of the data exchange 324 * operation to ensure correctness of the freshness check. 325 * xfs_exchange_range_finish must be called after the operation completes 326 * successfully but before locks are dropped. 327 */ 328 329 /* Verify that we have security clearance to perform this operation. */ 330 static int 331 xfs_exchange_range_verify_area( 332 struct xfs_exchrange *fxr) 333 { 334 int ret; 335 336 ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, 337 true); 338 if (ret) 339 return ret; 340 341 return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, 342 true); 343 } 344 345 /* 346 * Performs necessary checks before doing a range exchange, having stabilized 347 * mutable inode attributes via i_rwsem. 348 */ 349 static inline int 350 xfs_exchange_range_checks( 351 struct xfs_exchrange *fxr, 352 unsigned int alloc_unit) 353 { 354 struct inode *inode1 = file_inode(fxr->file1); 355 struct inode *inode2 = file_inode(fxr->file2); 356 uint64_t allocmask = alloc_unit - 1; 357 int64_t test_len; 358 uint64_t blen; 359 loff_t size1, size2, tmp; 360 int error; 361 362 /* Don't touch certain kinds of inodes */ 363 if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) 364 return -EPERM; 365 if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) 366 return -ETXTBSY; 367 368 size1 = i_size_read(inode1); 369 size2 = i_size_read(inode2); 370 371 /* Ranges cannot start after EOF. */ 372 if (fxr->file1_offset > size1 || fxr->file2_offset > size2) 373 return -EINVAL; 374 375 /* 376 * If the caller said to exchange to EOF, we set the length of the 377 * request large enough to cover everything to the end of both files. 378 */ 379 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { 380 fxr->length = max_t(int64_t, size1 - fxr->file1_offset, 381 size2 - fxr->file2_offset); 382 383 error = xfs_exchange_range_verify_area(fxr); 384 if (error) 385 return error; 386 } 387 388 /* 389 * The start of both ranges must be aligned to the file allocation 390 * unit. 391 */ 392 if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) || 393 !IS_ALIGNED(fxr->file2_offset, alloc_unit)) 394 return -EINVAL; 395 396 /* Ensure offsets don't wrap. */ 397 if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) || 398 check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) 399 return -EINVAL; 400 401 /* 402 * We require both ranges to end within EOF, unless we're exchanging 403 * to EOF. 404 */ 405 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && 406 (fxr->file1_offset + fxr->length > size1 || 407 fxr->file2_offset + fxr->length > size2)) 408 return -EINVAL; 409 410 /* 411 * Make sure we don't hit any file size limits. If we hit any size 412 * limits such that test_length was adjusted, we abort the whole 413 * operation. 414 */ 415 test_len = fxr->length; 416 error = generic_write_check_limits(fxr->file2, fxr->file2_offset, 417 &test_len); 418 if (error) 419 return error; 420 error = generic_write_check_limits(fxr->file1, fxr->file1_offset, 421 &test_len); 422 if (error) 423 return error; 424 if (test_len != fxr->length) 425 return -EINVAL; 426 427 /* 428 * If the user wanted us to exchange up to the infile's EOF, round up 429 * to the next allocation unit boundary for this check. Do the same 430 * for the outfile. 431 * 432 * Otherwise, reject the range length if it's not aligned to an 433 * allocation unit. 434 */ 435 if (fxr->file1_offset + fxr->length == size1) 436 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset; 437 else if (fxr->file2_offset + fxr->length == size2) 438 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset; 439 else if (!IS_ALIGNED(fxr->length, alloc_unit)) 440 return -EINVAL; 441 else 442 blen = fxr->length; 443 444 /* Don't allow overlapped exchanges within the same file. */ 445 if (inode1 == inode2 && 446 fxr->file2_offset + blen > fxr->file1_offset && 447 fxr->file1_offset + blen > fxr->file2_offset) 448 return -EINVAL; 449 450 /* 451 * Ensure that we don't exchange a partial EOF block into the middle of 452 * another file. 453 */ 454 if ((fxr->length & allocmask) == 0) 455 return 0; 456 457 blen = fxr->length; 458 if (fxr->file2_offset + blen < size2) 459 blen &= ~allocmask; 460 461 if (fxr->file1_offset + blen < size1) 462 blen &= ~allocmask; 463 464 return blen == fxr->length ? 0 : -EINVAL; 465 } 466 467 /* 468 * Check that the two inodes are eligible for range exchanges, the ranges make 469 * sense, and then flush all dirty data. Caller must ensure that the inodes 470 * have been locked against any other modifications. 471 */ 472 static inline int 473 xfs_exchange_range_prep( 474 struct xfs_exchrange *fxr, 475 unsigned int alloc_unit) 476 { 477 struct inode *inode1 = file_inode(fxr->file1); 478 struct inode *inode2 = file_inode(fxr->file2); 479 bool same_inode = (inode1 == inode2); 480 int error; 481 482 /* Check that we don't violate system file offset limits. */ 483 error = xfs_exchange_range_checks(fxr, alloc_unit); 484 if (error || fxr->length == 0) 485 return error; 486 487 /* Wait for the completion of any pending IOs on both files */ 488 inode_dio_wait(inode1); 489 if (!same_inode) 490 inode_dio_wait(inode2); 491 492 error = filemap_write_and_wait_range(inode1->i_mapping, 493 fxr->file1_offset, 494 fxr->file1_offset + fxr->length - 1); 495 if (error) 496 return error; 497 498 error = filemap_write_and_wait_range(inode2->i_mapping, 499 fxr->file2_offset, 500 fxr->file2_offset + fxr->length - 1); 501 if (error) 502 return error; 503 504 /* 505 * If the files or inodes involved require synchronous writes, amend 506 * the request to force the filesystem to flush all data and metadata 507 * to disk after the operation completes. 508 */ 509 if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) || 510 IS_SYNC(inode1) || IS_SYNC(inode2)) 511 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC; 512 513 return 0; 514 } 515 516 /* 517 * Finish a range exchange operation, if it was successful. Caller must ensure 518 * that the inodes are still locked against any other modifications. 519 */ 520 static inline int 521 xfs_exchange_range_finish( 522 struct xfs_exchrange *fxr) 523 { 524 int error; 525 526 error = file_remove_privs(fxr->file1); 527 if (error) 528 return error; 529 if (file_inode(fxr->file1) == file_inode(fxr->file2)) 530 return 0; 531 532 return file_remove_privs(fxr->file2); 533 } 534 535 /* 536 * Check the alignment of an exchange request when the allocation unit size 537 * isn't a power of two. The generic file-level helpers use (fast) 538 * bitmask-based alignment checks, but here we have to use slow long division. 539 */ 540 static int 541 xfs_exchrange_check_rtalign( 542 const struct xfs_exchrange *fxr, 543 struct xfs_inode *ip1, 544 struct xfs_inode *ip2, 545 unsigned int alloc_unit) 546 { 547 uint64_t length = fxr->length; 548 uint64_t blen; 549 loff_t size1, size2; 550 551 size1 = i_size_read(VFS_I(ip1)); 552 size2 = i_size_read(VFS_I(ip2)); 553 554 /* The start of both ranges must be aligned to a rt extent. */ 555 if (!isaligned_64(fxr->file1_offset, alloc_unit) || 556 !isaligned_64(fxr->file2_offset, alloc_unit)) 557 return -EINVAL; 558 559 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) 560 length = max_t(int64_t, size1 - fxr->file1_offset, 561 size2 - fxr->file2_offset); 562 563 /* 564 * If the user wanted us to exchange up to the infile's EOF, round up 565 * to the next rt extent boundary for this check. Do the same for the 566 * outfile. 567 * 568 * Otherwise, reject the range length if it's not rt extent aligned. 569 * We already confirmed the starting offsets' rt extent block 570 * alignment. 571 */ 572 if (fxr->file1_offset + length == size1) 573 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset; 574 else if (fxr->file2_offset + length == size2) 575 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset; 576 else if (!isaligned_64(length, alloc_unit)) 577 return -EINVAL; 578 else 579 blen = length; 580 581 /* Don't allow overlapped exchanges within the same file. */ 582 if (ip1 == ip2 && 583 fxr->file2_offset + blen > fxr->file1_offset && 584 fxr->file1_offset + blen > fxr->file2_offset) 585 return -EINVAL; 586 587 /* 588 * Ensure that we don't exchange a partial EOF rt extent into the 589 * middle of another file. 590 */ 591 if (isaligned_64(length, alloc_unit)) 592 return 0; 593 594 blen = length; 595 if (fxr->file2_offset + length < size2) 596 blen = rounddown_64(blen, alloc_unit); 597 598 if (fxr->file1_offset + blen < size1) 599 blen = rounddown_64(blen, alloc_unit); 600 601 return blen == length ? 0 : -EINVAL; 602 } 603 604 /* Prepare two files to have their data exchanged. */ 605 STATIC int 606 xfs_exchrange_prep( 607 struct xfs_exchrange *fxr, 608 struct xfs_inode *ip1, 609 struct xfs_inode *ip2) 610 { 611 struct xfs_mount *mp = ip2->i_mount; 612 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2); 613 int error; 614 615 trace_xfs_exchrange_prep(fxr, ip1, ip2); 616 617 /* Verify both files are either real-time or non-realtime */ 618 if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) 619 return -EINVAL; 620 621 /* Check non-power of two alignment issues, if necessary. */ 622 if (!is_power_of_2(alloc_unit)) { 623 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit); 624 if (error) 625 return error; 626 627 /* 628 * Do the generic file-level checks with the regular block 629 * alignment. 630 */ 631 alloc_unit = mp->m_sb.sb_blocksize; 632 } 633 634 error = xfs_exchange_range_prep(fxr, alloc_unit); 635 if (error || fxr->length == 0) 636 return error; 637 638 if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { 639 error = xfs_exchrange_check_freshness(fxr, ip2); 640 if (error) 641 return error; 642 } 643 644 /* Attach dquots to both inodes before changing block maps. */ 645 error = xfs_qm_dqattach(ip2); 646 if (error) 647 return error; 648 error = xfs_qm_dqattach(ip1); 649 if (error) 650 return error; 651 652 trace_xfs_exchrange_flush(fxr, ip1, ip2); 653 654 /* Flush the relevant ranges of both files. */ 655 error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); 656 if (error) 657 return error; 658 error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); 659 if (error) 660 return error; 661 662 /* 663 * Cancel CoW fork preallocations for the ranges of both files. The 664 * prep function should have flushed all the dirty data, so the only 665 * CoW mappings remaining should be speculative. 666 */ 667 if (xfs_inode_has_cow_data(ip1)) { 668 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset, 669 fxr->length, true); 670 if (error) 671 return error; 672 } 673 674 if (xfs_inode_has_cow_data(ip2)) { 675 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset, 676 fxr->length, true); 677 if (error) 678 return error; 679 } 680 681 return 0; 682 } 683 684 /* 685 * Exchange contents of files. This is the binding between the generic 686 * file-level concepts and the XFS inode-specific implementation. 687 */ 688 STATIC int 689 xfs_exchrange_contents( 690 struct xfs_exchrange *fxr) 691 { 692 struct inode *inode1 = file_inode(fxr->file1); 693 struct inode *inode2 = file_inode(fxr->file2); 694 struct xfs_inode *ip1 = XFS_I(inode1); 695 struct xfs_inode *ip2 = XFS_I(inode2); 696 struct xfs_mount *mp = ip1->i_mount; 697 int error; 698 699 if (!xfs_has_exchange_range(mp)) 700 return -EOPNOTSUPP; 701 702 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | 703 XFS_EXCHANGE_RANGE_PRIV_FLAGS)) 704 return -EINVAL; 705 706 if (xfs_is_shutdown(mp)) 707 return -EIO; 708 709 /* Lock both files against IO */ 710 error = xfs_ilock2_io_mmap(ip1, ip2); 711 if (error) 712 goto out_err; 713 714 /* Prepare and then exchange file contents. */ 715 error = xfs_exchrange_prep(fxr, ip1, ip2); 716 if (error) 717 goto out_unlock; 718 719 error = xfs_exchrange_mappings(fxr, ip1, ip2); 720 if (error) 721 goto out_unlock; 722 723 /* 724 * Finish the exchange by removing special file privileges like any 725 * other file write would do. This may involve turning on support for 726 * logged xattrs if either file has security capabilities. 727 */ 728 error = xfs_exchange_range_finish(fxr); 729 if (error) 730 goto out_unlock; 731 732 out_unlock: 733 xfs_iunlock2_io_mmap(ip1, ip2); 734 out_err: 735 if (error) 736 trace_xfs_exchrange_error(ip2, error, _RET_IP_); 737 return error; 738 } 739 740 /* Exchange parts of two files. */ 741 static int 742 xfs_exchange_range( 743 struct xfs_exchrange *fxr) 744 { 745 struct inode *inode1 = file_inode(fxr->file1); 746 struct inode *inode2 = file_inode(fxr->file2); 747 int ret; 748 749 BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & 750 XFS_EXCHANGE_RANGE_PRIV_FLAGS); 751 752 /* Both files must be on the same mount/filesystem. */ 753 if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) 754 return -EXDEV; 755 756 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | 757 __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) 758 return -EINVAL; 759 760 /* Userspace requests only honored for regular files. */ 761 if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) 762 return -EISDIR; 763 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) 764 return -EINVAL; 765 766 /* Both files must be opened for read and write. */ 767 if (!(fxr->file1->f_mode & FMODE_READ) || 768 !(fxr->file1->f_mode & FMODE_WRITE) || 769 !(fxr->file2->f_mode & FMODE_READ) || 770 !(fxr->file2->f_mode & FMODE_WRITE)) 771 return -EBADF; 772 773 /* Neither file can be opened append-only. */ 774 if ((fxr->file1->f_flags & O_APPEND) || 775 (fxr->file2->f_flags & O_APPEND)) 776 return -EBADF; 777 778 /* 779 * If we're not exchanging to EOF, we can check the areas before 780 * stabilizing both files' i_size. 781 */ 782 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { 783 ret = xfs_exchange_range_verify_area(fxr); 784 if (ret) 785 return ret; 786 } 787 788 /* Update cmtime if the fd/inode don't forbid it. */ 789 if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) 790 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1; 791 if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)) 792 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2; 793 794 file_start_write(fxr->file2); 795 ret = xfs_exchrange_contents(fxr); 796 file_end_write(fxr->file2); 797 if (ret) 798 return ret; 799 800 fsnotify_modify(fxr->file1); 801 if (fxr->file2 != fxr->file1) 802 fsnotify_modify(fxr->file2); 803 return 0; 804 } 805 806 /* Collect exchange-range arguments from userspace. */ 807 long 808 xfs_ioc_exchange_range( 809 struct file *file, 810 struct xfs_exchange_range __user *argp) 811 { 812 struct xfs_exchrange fxr = { 813 .file2 = file, 814 }; 815 struct xfs_exchange_range args; 816 817 if (copy_from_user(&args, argp, sizeof(args))) 818 return -EFAULT; 819 if (memchr_inv(&args.pad, 0, sizeof(args.pad))) 820 return -EINVAL; 821 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) 822 return -EINVAL; 823 824 fxr.file1_offset = args.file1_offset; 825 fxr.file2_offset = args.file2_offset; 826 fxr.length = args.length; 827 fxr.flags = args.flags; 828 829 CLASS(fd, file1)(args.file1_fd); 830 if (fd_empty(file1)) 831 return -EBADF; 832 fxr.file1 = fd_file(file1); 833 834 return xfs_exchange_range(&fxr); 835 } 836 837 /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ 838 struct xfs_commit_range_fresh { 839 xfs_fsid_t fsid; /* m_fixedfsid */ 840 __u64 file2_ino; /* inode number */ 841 __s64 file2_mtime; /* modification time */ 842 __s64 file2_ctime; /* change time */ 843 __s32 file2_mtime_nsec; /* mod time, nsec */ 844 __s32 file2_ctime_nsec; /* change time, nsec */ 845 __u32 file2_gen; /* inode generation */ 846 __u32 magic; /* zero */ 847 }; 848 #define XCR_FRESH_MAGIC 0x444F524B /* DORK */ 849 850 /* Set up a commitrange operation by sampling file2's write-related attrs */ 851 long 852 xfs_ioc_start_commit( 853 struct file *file, 854 struct xfs_commit_range __user *argp) 855 { 856 struct xfs_commit_range args = { }; 857 struct kstat kstat = { }; 858 struct xfs_commit_range_fresh *kern_f; 859 struct xfs_commit_range_fresh __user *user_f; 860 struct inode *inode2 = file_inode(file); 861 struct xfs_inode *ip2 = XFS_I(inode2); 862 const unsigned int lockflags = XFS_IOLOCK_SHARED | 863 XFS_MMAPLOCK_SHARED | 864 XFS_ILOCK_SHARED; 865 866 BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != 867 sizeof(args.file2_freshness)); 868 869 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; 870 871 memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); 872 873 xfs_ilock(ip2, lockflags); 874 /* Force writing of a distinct ctime if any writes happen. */ 875 fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); 876 kern_f->file2_ctime = kstat.ctime.tv_sec; 877 kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; 878 kern_f->file2_mtime = kstat.mtime.tv_sec; 879 kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; 880 kern_f->file2_ino = ip2->i_ino; 881 kern_f->file2_gen = inode2->i_generation; 882 kern_f->magic = XCR_FRESH_MAGIC; 883 xfs_iunlock(ip2, lockflags); 884 885 user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; 886 if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) 887 return -EFAULT; 888 889 return 0; 890 } 891 892 /* 893 * Exchange file1 and file2 contents if file2 has not been written since the 894 * start commit operation. 895 */ 896 long 897 xfs_ioc_commit_range( 898 struct file *file, 899 struct xfs_commit_range __user *argp) 900 { 901 struct xfs_exchrange fxr = { 902 .file2 = file, 903 }; 904 struct xfs_commit_range args; 905 struct xfs_commit_range_fresh *kern_f; 906 struct xfs_inode *ip2 = XFS_I(file_inode(file)); 907 struct xfs_mount *mp = ip2->i_mount; 908 909 kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; 910 911 if (copy_from_user(&args, argp, sizeof(args))) 912 return -EFAULT; 913 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) 914 return -EINVAL; 915 if (kern_f->magic != XCR_FRESH_MAGIC) 916 return -EBUSY; 917 if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t))) 918 return -EBUSY; 919 920 fxr.file1_offset = args.file1_offset; 921 fxr.file2_offset = args.file2_offset; 922 fxr.length = args.length; 923 fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; 924 fxr.file2_ino = kern_f->file2_ino; 925 fxr.file2_gen = kern_f->file2_gen; 926 fxr.file2_mtime.tv_sec = kern_f->file2_mtime; 927 fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; 928 fxr.file2_ctime.tv_sec = kern_f->file2_ctime; 929 fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; 930 931 CLASS(fd, file1)(args.file1_fd); 932 if (fd_empty(file1)) 933 return -EBADF; 934 fxr.file1 = fd_file(file1); 935 936 return xfs_exchange_range(&fxr); 937 } 938