// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_quota.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_trace.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rtbitmap.h"
#include <linux/fsnotify.h>

/*
 * Lock (and optionally join) two inodes for a file range exchange.
 *
 * XFS_ILOCK_EXCL is taken on both inodes; if @ip1 and @ip2 are the same inode
 * the lock is taken only once.  When @tp is not NULL, each distinct inode is
 * also joined to the transaction, passing 0 as the lock flags argument of
 * xfs_trans_ijoin().  Callers in this file drop the ILOCKs themselves with
 * xfs_exchrange_iunlock().
 */
void
xfs_exchrange_ilock(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip1 != ip2)
		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
				    ip2, XFS_ILOCK_EXCL);
	else
		xfs_ilock(ip1, XFS_ILOCK_EXCL);
	if (tp) {
		xfs_trans_ijoin(tp, ip1, 0);
		if (ip2 != ip1)
			xfs_trans_ijoin(tp, ip2, 0);
	}

}

/*
 * Unlock two inodes after a file range exchange operation.
 *
 * Releases XFS_ILOCK_EXCL on @ip2 first (skipped when it is the same inode
 * as @ip1) and then on @ip1.
 */
void
xfs_exchrange_iunlock(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip2 != ip1)
		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
}

/*
 * Estimate the resource requirements to exchange file contents between the two
 * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
 * have flushed both inodes' pagecache and active direct-ios.
 *
 * The ILOCKs of req->ip1 and req->ip2 are held (without a transaction) only
 * for the duration of the xfs_exchmaps_estimate() call.  Returns whatever
 * xfs_exchmaps_estimate() returns.
 */
int
xfs_exchrange_estimate(
	struct xfs_exchmaps_req	*req)
{
	int			error;

	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
	error = xfs_exchmaps_estimate(req);
	xfs_exchrange_iunlock(req->ip1, req->ip2);
	return error;
}

/*
 * Bits reported through the @qretry argument of xfs_exchrange_reserve_quota()
 * to say which inode's quota reservation failed with -EDQUOT or -ENOSPC.
 */
#define QRETRY_IP1	(0x1)
#define QRETRY_IP2	(0x2)

/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.
 *
 * @qretry is a bitmask of QRETRY_IP* values.  It is cleared by this function
 * before any reservation is attempted, but it is NOT written on the early
 * "nothing to reserve" return, so callers must only look at it when this
 * function returns -EDQUOT or -ENOSPC.
 *
 * Returns 0 if no reservation was needed or all reservations succeeded.  If
 * the reservation for ip1 fails with -EDQUOT/-ENOSPC, the reservation for ip2
 * is still attempted so that both QRETRY bits can be reported; an ip2 failure
 * is returned in preference to the saved ip1 failure.
 */
STATIC int
xfs_exchrange_reserve_quota(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_req	*req,
	unsigned int			*qretry)
{
	int64_t				ddelta, rdelta;
	int				ip1_error = 0;
	int				error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	*qretry = 0;

	/*
	 * For each file, compute the net gain in the number of regular blocks
	 * that will be mapped into that file and reserve that much quota.  The
	 * quota counts must be able to absorb at least that much space.
	 */
	ddelta = req->ip2_bcount - req->ip1_bcount;
	rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
	/* A positive delta is charged to ip1 ... */
	if (ddelta > 0 || rdelta > 0) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
				ddelta > 0 ? ddelta : 0,
				rdelta > 0 ? rdelta : 0,
				false);
		if (error == -EDQUOT || error == -ENOSPC) {
			/*
			 * Save this error and see what happens if we try to
			 * reserve quota for ip2.  Then report both.
			 */
			*qretry |= QRETRY_IP1;
			ip1_error = error;
			error = 0;
		}
		if (error)
			return error;
	}
	/* ... and a negative delta (negated) is charged to ip2. */
	if (ddelta < 0 || rdelta < 0) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
				ddelta < 0 ? -ddelta : 0,
				rdelta < 0 ? -rdelta : 0,
				false);
		if (error == -EDQUOT || error == -ENOSPC)
			*qretry |= QRETRY_IP2;
		if (error)
			return error;
	}
	/* ip2 was fine (or not needed); now report the deferred ip1 failure. */
	if (ip1_error)
		return ip1_error;

	/*
	 * For each file, forcibly reserve the gross gain in mapped blocks so
	 * that we don't trip over any quota block reservation assertions.
	 * We must reserve the gross gain because the quota code subtracts from
	 * bcount the number of blocks that we unmap; it does not add that
	 * quantity back to the quota block reservation.
	 */
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
			req->ip1_rtbcount, true);
	if (error)
		return error;

	return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
			req->ip2_rtbcount, true);
}

/*
 * Exchange the mappings (and hence the contents) of two files' forks.
 *
 * Builds an exchmaps request from the byte offsets and length in @fxr,
 * estimates the resources needed, then allocates a transaction, takes both
 * ILOCKs, reserves quota and performs the exchange.  If the quota reservation
 * fails with -EDQUOT or -ENOSPC, the transaction is cancelled, the inodes are
 * unlocked, xfs_blockgc_free_quota() is run on the inode(s) flagged in qretry
 * and the whole sequence is retried exactly once.
 *
 * With XFS_EXCHANGE_RANGE_DRY_RUN set, the transaction is cancelled after all
 * checks have passed and 0 is returned.
 *
 * The ILOCKs are always dropped before returning.  Returns 0 or a negative
 * errno.
 */
STATIC int
xfs_exchrange_mappings(
	const struct xfs_exchrange	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2)
{
	struct xfs_mount		*mp = ip1->i_mount;
	/* Members not named here (e.g. .flags) start out as zero. */
	struct xfs_exchmaps_req		req = {
		.ip1			= ip1,
		.ip2			= ip2,
		.startoff1		= XFS_B_TO_FSBT(mp, fxr->file1_offset),
		.startoff2		= XFS_B_TO_FSBT(mp, fxr->file2_offset),
		.blockcount		= XFS_B_TO_FSB(mp, fxr->length),
	};
	struct xfs_trans		*tp;
	unsigned int			qretry;
	bool				retried = false;
	int				error;

	trace_xfs_exchrange_mappings(fxr, ip1, ip2);

	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
		req.flags |= XFS_EXCHMAPS_SET_SIZES;
	if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
		req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;

	/*
	 * Round the request length up to the nearest file allocation unit.
	 * The prep function already checked that the request offsets and
	 * length in @fxr are safe to round up.
	 */
	if (xfs_inode_has_bigrtalloc(ip2))
		req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);

	/*
	 * NOTE(review): req.resblks is consumed below but never assigned in
	 * this function; presumably xfs_exchrange_estimate() fills it in --
	 * confirm against xfs_exchmaps_estimate().
	 */
	error = xfs_exchrange_estimate(&req);
	if (error)
		return error;

retry:
	/* Allocate the transaction, lock the inodes, and join them. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
			XFS_TRANS_RES_FDBLKS, &tp);
	if (error)
		return error;

	xfs_exchrange_ilock(tp, ip1, ip2);

	trace_xfs_exchrange_before(ip2, 2);
	trace_xfs_exchrange_before(ip1, 1);

	error = xfs_exchmaps_check_forks(mp, &req);
	if (error)
		goto out_trans_cancel;

	/*
	 * Reserve ourselves some quota if any of them are in enforcing mode.
	 * In theory we only need enough to satisfy the change in the number
	 * of blocks between the two ranges being remapped.
	 */
	error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		/*
		 * Drop the transaction and the ILOCKs, try to free up space
		 * charged to whichever inode(s) failed, and start over once.
		 */
		xfs_trans_cancel(tp);
		xfs_exchrange_iunlock(ip1, ip2);
		if (qretry & QRETRY_IP1)
			xfs_blockgc_free_quota(ip1, 0);
		if (qretry & QRETRY_IP2)
			xfs_blockgc_free_quota(ip2, 0);
		retried = true;
		goto retry;
	}
	if (error)
		goto out_trans_cancel;

	/* If we got this far on a dry run, all parameters are ok. */
	if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
		goto out_trans_cancel;

	/* Update the mtime and ctime of both files. */
	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
		xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
		xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	xfs_exchange_mappings(tp, &req);

	/*
	 * Force the log to persist metadata updates if the caller or the
	 * administrator requires this.  The generic prep function already
	 * flushed the relevant parts of the page cache.
	 */
	if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_exchrange_after(ip2, 2);
	trace_xfs_exchrange_after(ip1, 1);

	/* The ILOCKs are still held here and are dropped at out_unlock. */
	if (error)
		goto out_unlock;

	/*
	 * If the caller wanted us to exchange the contents of two complete
	 * files of unequal length, exchange the incore sizes now.  This should
	 * be safe because we flushed both files' page caches, exchanged all
	 * the mappings, and updated the ondisk sizes.
	 */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		loff_t	temp;

		temp = i_size_read(VFS_I(ip2));
		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
		i_size_write(VFS_I(ip1), temp);
	}

out_unlock:
	xfs_exchrange_iunlock(ip1, ip2);
	return error;

out_trans_cancel:
	/* Cancel first, then share the unlock-and-return path above. */
	xfs_trans_cancel(tp);
	goto out_unlock;
}

/*
 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 * This part deals with struct file objects and byte ranges and does not deal
 * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 * separation may some day facilitate porting to another filesystem.
 *
 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 * Implementations must call xfs_exchange_range_prep to prepare the two
 * files prior to taking locks; and they must update the inode change and mod
 * times of both files as part of the metadata update.  The timestamp update
 * and freshness checks must be done atomically as part of the data exchange
 * operation to ensure correctness of the freshness check.
 * xfs_exchange_range_finish must be called after the operation completes
 * successfully but before locks are dropped.
 */

/*
 * Verify that we have security clearance to perform this operation.
 *
 * Runs remap_verify_area() on the file1 range and then on the file2 range,
 * passing true as the final argument both times, and returns the first
 * nonzero result.
 */
static int
xfs_exchange_range_verify_area(
	struct xfs_exchrange	*fxr)
{
	int			ret;

	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
			true);
	if (ret)
		return ret;

	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
			true);
}

/*
 * Performs necessary checks before doing a range exchange, having stabilized
 * mutable inode attributes via i_rwsem.
 *
 * @alloc_unit is used to build a bitmask (alloc_unit - 1) and is passed to
 * IS_ALIGNED()/ALIGN(), so it has to be a power of two; the caller in this
 * file substitutes the filesystem block size when the real allocation unit
 * is not.
 *
 * When XFS_EXCHANGE_RANGE_TO_EOF is set, fxr->length is overwritten with the
 * larger of the two distances to EOF.
 *
 * Returns 0 if the request is acceptable, -EPERM for immutable inodes,
 * -ETXTBSY for swapfiles, -EINVAL for bad offsets/lengths/alignment/overlap,
 * or an error from the verify-area or write-limit helpers.
 */
static inline int
xfs_exchange_range_checks(
	struct xfs_exchrange	*fxr,
	unsigned int		alloc_unit)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	uint64_t		allocmask = alloc_unit - 1;
	int64_t			test_len;
	uint64_t		blen;
	loff_t			size1, size2, tmp;
	int			error;

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
		return -EPERM;
	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
		return -ETXTBSY;

	size1 = i_size_read(inode1);
	size2 = i_size_read(inode2);

	/* Ranges cannot start after EOF. */
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
		return -EINVAL;

	/*
	 * If the caller said to exchange to EOF, we set the length of the
	 * request large enough to cover everything to the end of both files.
	 * The area check is done here because the length was not known
	 * before the file sizes were read.
	 */
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
					     size2 - fxr->file2_offset);

		error = xfs_exchange_range_verify_area(fxr);
		if (error)
			return error;
	}

	/*
	 * The start of both ranges must be aligned to the file allocation
	 * unit.
	 */
	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	/* Ensure offsets don't wrap.  Only the overflow result is used. */
	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
		return -EINVAL;

	/*
	 * We require both ranges to end within EOF, unless we're exchanging
	 * to EOF.
	 */
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
	    (fxr->file1_offset + fxr->length > size1 ||
	     fxr->file2_offset + fxr->length > size2))
		return -EINVAL;

	/*
	 * Make sure we don't hit any file size limits.  If we hit any size
	 * limits such that test_len was adjusted, we abort the whole
	 * operation.
	 */
	test_len = fxr->length;
	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
			&test_len);
	if (error)
		return error;
	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
			&test_len);
	if (error)
		return error;
	if (test_len != fxr->length)
		return -EINVAL;

	/*
	 * If the user wanted us to exchange up to file1's EOF, round up to
	 * the next allocation unit boundary for this check.  Do the same for
	 * file2.
	 *
	 * Otherwise, reject the range length if it's not aligned to an
	 * allocation unit.
	 */
	if (fxr->file1_offset + fxr->length == size1)
		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + fxr->length == size2)
		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
	else if (!IS_ALIGNED(fxr->length, alloc_unit))
		return -EINVAL;
	else
		blen = fxr->length;

	/* Don't allow overlapped exchanges within the same file. */
	if (inode1 == inode2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF block into the middle of
	 * another file.  An aligned length cannot contain a partial block.
	 */
	if ((fxr->length & allocmask) == 0)
		return 0;

	/*
	 * The length is unaligned.  If either range ends short of its file's
	 * EOF, round blen down so that it no longer equals fxr->length and the
	 * request is rejected below.
	 */
	blen = fxr->length;
	if (fxr->file2_offset + blen < size2)
		blen &= ~allocmask;

	if (fxr->file1_offset + blen < size1)
		blen &= ~allocmask;

	return blen == fxr->length ? 0 : -EINVAL;
}

/*
 * Check that the two inodes are eligible for range exchanges, the ranges make
 * sense, and then flush all dirty data.  Caller must ensure that the inodes
 * have been locked against any other modifications.
 *
 * Returns 0 without flushing anything if the (possibly recomputed)
 * fxr->length is zero.  On success, XFS_EXCHANGE_RANGE_DSYNC is added to
 * fxr->flags if either file was opened O_SYNC or either inode is IS_SYNC.
 */
static inline int
xfs_exchange_range_prep(
	struct xfs_exchrange	*fxr,
	unsigned int		alloc_unit)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	bool			same_inode = (inode1 == inode2);
	int			error;

	/* Check that we don't violate system file offset limits. */
	error = xfs_exchange_range_checks(fxr, alloc_unit);
	if (error || fxr->length == 0)
		return error;

	/* Wait for the completion of any pending IOs on both files */
	inode_dio_wait(inode1);
	if (!same_inode)
		inode_dio_wait(inode2);

	/* Write back dirty pagecache for both byte ranges (end is inclusive). */
	error = filemap_write_and_wait_range(inode1->i_mapping,
			fxr->file1_offset,
			fxr->file1_offset + fxr->length - 1);
	if (error)
		return error;

	error = filemap_write_and_wait_range(inode2->i_mapping,
			fxr->file2_offset,
			fxr->file2_offset + fxr->length - 1);
	if (error)
		return error;

	/*
	 * If the files or inodes involved require synchronous writes, amend
	 * the request to force the filesystem to flush all data and metadata
	 * to disk after the operation completes.
	 */
	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
	    IS_SYNC(inode1) || IS_SYNC(inode2))
		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;

	return 0;
}

/*
 * Finish a range exchange operation, if it was successful.  Caller must ensure
 * that the inodes are still locked against any other modifications.
 *
 * Calls file_remove_privs() on file1, and on file2 as well unless both files
 * refer to the same inode.  Returns the first error encountered.
 */
static inline int
xfs_exchange_range_finish(
	struct xfs_exchrange	*fxr)
{
	int			error;

	error = file_remove_privs(fxr->file1);
	if (error)
		return error;
	if (file_inode(fxr->file1) == file_inode(fxr->file2))
		return 0;

	return file_remove_privs(fxr->file2);
}

/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The generic file-level helpers use (fast)
 * bitmask-based alignment checks, but here we have to use slow long division.
 *
 * This mirrors the alignment, overlap and partial-EOF checks of
 * xfs_exchange_range_checks() using isaligned_64/roundup_64/rounddown_64.
 * @fxr is not modified; for XFS_EXCHANGE_RANGE_TO_EOF the length is computed
 * into a local.  Returns 0 or -EINVAL.
 */
static int
xfs_exchrange_check_rtalign(
	const struct xfs_exchrange	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	unsigned int			alloc_unit)
{
	uint64_t			length = fxr->length;
	uint64_t			blen;
	loff_t				size1, size2;

	size1 = i_size_read(VFS_I(ip1));
	size2 = i_size_read(VFS_I(ip2));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
	    !isaligned_64(fxr->file2_offset, alloc_unit))
		return -EINVAL;

	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
		length = max_t(int64_t, size1 - fxr->file1_offset,
					size2 - fxr->file2_offset);

	/*
	 * If the user wanted us to exchange up to file1's EOF, round up to
	 * the next rt extent boundary for this check.  Do the same for
	 * file2.
	 *
	 * Otherwise, reject the range length if it's not rt extent aligned.
	 * We already confirmed the starting offsets' rt extent block
	 * alignment.
	 */
	if (fxr->file1_offset + length == size1)
		blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
	else if (fxr->file2_offset + length == size2)
		blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
	else if (!isaligned_64(length, alloc_unit))
		return -EINVAL;
	else
		blen = length;

	/* Don't allow overlapped exchanges within the same file. */
	if (ip1 == ip2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF rt extent into the
	 * middle of another file.
	 */
	if (isaligned_64(length, alloc_unit))
		return 0;

	/*
	 * Unaligned length: rounding blen down makes it differ from length,
	 * which rejects the request, whenever a range ends before EOF.
	 */
	blen = length;
	if (fxr->file2_offset + length < size2)
		blen = rounddown_64(blen, alloc_unit);

	if (fxr->file1_offset + blen < size1)
		blen = rounddown_64(blen, alloc_unit);

	return blen == length ? 0 : -EINVAL;
}

/*
 * Prepare two files to have their data exchanged.
 *
 * Validates the request, attaches dquots, flushes and unmaps the pagecache
 * for both ranges and cancels CoW fork reservations in those ranges.  Returns
 * 0 early (doing none of the flushing) if the request length is zero after
 * the generic checks.  Returns 0 or a negative errno.
 */
STATIC int
xfs_exchrange_prep(
	struct xfs_exchrange	*fxr,
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	struct xfs_mount	*mp = ip2->i_mount;
	/* The allocation unit is taken from ip2 only. */
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
	int			error;

	trace_xfs_exchrange_prep(fxr, ip1, ip2);

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
		return -EINVAL;

	/* Check non-power of two alignment issues, if necessary. */
	if (!is_power_of_2(alloc_unit)) {
		error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
		if (error)
			return error;

		/*
		 * Do the generic file-level checks with the regular block
		 * alignment.
		 */
		alloc_unit = mp->m_sb.sb_blocksize;
	}

	error = xfs_exchange_range_prep(fxr, alloc_unit);
	if (error || fxr->length == 0)
		return error;

	/* Attach dquots to both inodes before changing block maps. */
	error = xfs_qm_dqattach(ip2);
	if (error)
		return error;
	error = xfs_qm_dqattach(ip1);
	if (error)
		return error;

	trace_xfs_exchrange_flush(fxr, ip1, ip2);

	/* Flush the relevant ranges of both files. */
	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
	if (error)
		return error;
	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
	if (error)
		return error;

	/*
	 * Cancel CoW fork preallocations for the ranges of both files.  The
	 * prep function should have flushed all the dirty data, so the only
	 * CoW mappings remaining should be speculative.
	 */
	if (xfs_inode_has_cow_data(ip1)) {
		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
				fxr->length, true);
		if (error)
			return error;
	}

	if (xfs_inode_has_cow_data(ip2)) {
		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
				fxr->length, true);
		if (error)
			return error;
	}

	return 0;
}

/*
 * Exchange contents of files.  This is the binding between the generic
 * file-level concepts and the XFS inode-specific implementation.
 *
 * Returns -EOPNOTSUPP if the filesystem lacks the exchange-range feature,
 * -EINVAL for flags outside the public and private flag sets, -EIO if the
 * filesystem is shut down, or the first error from locking, preparation,
 * the mapping exchange, or the finish step.  Any error is traced against
 * ip2.
 *
 * NOTE(review): a successful XFS_EXCHANGE_RANGE_DRY_RUN returns 0 from
 * xfs_exchrange_mappings(), so xfs_exchange_range_finish() still runs for
 * dry runs -- confirm that this is intended.
 */
STATIC int
xfs_exchrange_contents(
	struct xfs_exchrange	*fxr)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	struct xfs_inode	*ip1 = XFS_I(inode1);
	struct xfs_inode	*ip2 = XFS_I(inode2);
	struct xfs_mount	*mp = ip1->i_mount;
	int			error;

	if (!xfs_has_exchange_range(mp))
		return -EOPNOTSUPP;

	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
			   XFS_EXCHANGE_RANGE_PRIV_FLAGS))
		return -EINVAL;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Lock both files against IO */
	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		goto out_err;

	/* Prepare and then exchange file contents. */
	error = xfs_exchrange_prep(fxr, ip1, ip2);
	if (error)
		goto out_unlock;

	error = xfs_exchrange_mappings(fxr, ip1, ip2);
	if (error)
		goto out_unlock;

	/*
	 * Finish the exchange by removing special file privileges like any
	 * other file write would do.  This may involve turning on support for
	 * logged xattrs if either file has security capabilities.
	 */
	error = xfs_exchange_range_finish(fxr);
	/* Both outcomes reach out_unlock; the goto only mirrors the above. */
	if (error)
		goto out_unlock;

out_unlock:
	xfs_iunlock2_io_mmap(ip1, ip2);
out_err:
	if (error)
		trace_xfs_exchrange_error(ip2, error, _RET_IP_);
	return error;
}

/*
 * Exchange parts of two files.
 *
 * Performs the file-level validation of a userspace request before handing
 * it to xfs_exchrange_contents():
 *  - both files must be on the same mount (-EXDEV);
 *  - only public flags may be set (-EINVAL);
 *  - both must be regular files (-EISDIR for directories, else -EINVAL);
 *  - both must be open for reading and writing and not O_APPEND (-EBADF).
 *
 * On success, fsnotify_modify() is raised once per distinct struct file.
 */
static int
xfs_exchange_range(
	struct xfs_exchrange	*fxr)
{
	struct inode		*inode1 = file_inode(fxr->file1);
	struct inode		*inode2 = file_inode(fxr->file2);
	int			ret;

	/* Public and kernel-private flag bits must never collide. */
	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);

	/* Both files must be on the same mount/filesystem. */
	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
		return -EXDEV;

	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
		return -EINVAL;

	/* Userspace requests only honored for regular files. */
	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	/* Both files must be opened for read and write. */
	if (!(fxr->file1->f_mode & FMODE_READ) ||
	    !(fxr->file1->f_mode & FMODE_WRITE) ||
	    !(fxr->file2->f_mode & FMODE_READ) ||
	    !(fxr->file2->f_mode & FMODE_WRITE))
		return -EBADF;

	/* Neither file can be opened append-only. */
	if ((fxr->file1->f_flags & O_APPEND) ||
	    (fxr->file2->f_flags & O_APPEND))
		return -EBADF;

	/*
	 * If we're not exchanging to EOF, we can check the areas before
	 * stabilizing both files' i_size.
	 */
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
		ret = xfs_exchange_range_verify_area(fxr);
		if (ret)
			return ret;
	}

	/* Update cmtime if the fd/inode don't forbid it. */
	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;

	/*
	 * file_start_write() is taken on file2 only; file1 was verified
	 * above to be on the same mount.
	 */
	file_start_write(fxr->file2);
	ret = xfs_exchrange_contents(fxr);
	file_end_write(fxr->file2);
	if (ret)
		return ret;

	fsnotify_modify(fxr->file1);
	if (fxr->file2 != fxr->file1)
		fsnotify_modify(fxr->file2);
	return 0;
}

/*
 * Collect exchange-range arguments from userspace.
 *
 * @file is the file the ioctl was issued on and becomes file2 of the
 * request; file1 is looked up from args.file1_fd.  Returns -EFAULT if the
 * argument structure cannot be copied in, -EINVAL if the pad field is not
 * all zeroes or unknown flags are set, -EBADF if file1_fd does not resolve
 * to a file, or the result of xfs_exchange_range().
 */
long
xfs_ioc_exchange_range(
	struct file			*file,
	struct xfs_exchange_range __user	*argp)
{
	struct xfs_exchrange		fxr = {
		.file2			= file,
	};
	struct xfs_exchange_range	args;
	struct fd			file1;
	int				error;

	if (copy_from_user(&args, argp, sizeof(args)))
		return -EFAULT;
	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
		return -EINVAL;
	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
		return -EINVAL;

	fxr.file1_offset	= args.file1_offset;
	fxr.file2_offset	= args.file2_offset;
	fxr.length		= args.length;
	fxr.flags		= args.flags;

	file1 = fdget(args.file1_fd);
	if (!fd_file(file1))
		return -EBADF;
	fxr.file1 = fd_file(file1);

	/* The fd reference is held across the whole exchange. */
	error = xfs_exchange_range(&fxr);
	fdput(file1);
	return error;
}