// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_exchmaps_item.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_symlink_remote.h"

struct kmem_cache	*xfs_exchmaps_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_exchmaps_adjacent {
	struct xfs_bmbt_irec		left1;
	struct xfs_bmbt_irec		right1;
	struct xfs_bmbt_irec		left2;
	struct xfs_bmbt_irec		right2;
};

#define ADJACENT_INIT { \
	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}

/* Information to reset reflink flag / CoW fork state after an exchange. */

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them.  If there's a CoW fork and
 * it has mappings in it, make sure the inodes are tagged appropriately so
 * that speculative preallocations can be GC'd if we run low on space.
 */
static inline void
xfs_exchmaps_ensure_cowfork(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*cfork;

	if (xfs_is_reflink_inode(ip))
		xfs_ifork_init_cow(ip);

	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (!cfork)
		return;
	if (cfork->if_bytes > 0)
		xfs_inode_set_cowblocks_tag(ip);
	else
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Adjust the on-disk inode size upwards if needed so that we never add
 * mappings into the file past EOF.  This is crucial so that log recovery
 * won't get confused by the sudden appearance of post-eof mappings.
 */
STATIC void
xfs_exchmaps_update_size(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fsize_t		new_isize)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_fsize_t		len;

	if (new_isize < 0)
		return;

	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
		  new_isize);

	if (len <= ip->i_disk_size)
		return;

	trace_xfs_exchmaps_update_inode_size(ip, len);

	ip->i_disk_size = len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
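/*
 * Illustrative example (not part of the original source): suppose 4096-byte
 * blocks and a mapping with br_startoff == 16 and br_blockcount == 8, so the
 * mapping ends at byte (16 + 8) * 4096 = 98304.  If new_isize is 90000,
 * xfs_exchmaps_update_size() clamps len to 90000, and the on-disk size is
 * bumped only if it was smaller than that; either way we never expose
 * mappings beyond the post-exchange EOF.
 */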
/* Advance the incore state tracking after exchanging a mapping. */
static inline void
xmi_advance(
	struct xfs_exchmaps_intent	*xmi,
	const struct xfs_bmbt_irec	*irec)
{
	xmi->xmi_startoff1 += irec->br_blockcount;
	xmi->xmi_startoff2 += irec->br_blockcount;
	xmi->xmi_blockcount -= irec->br_blockcount;
}

/* Do we still have more mappings to exchange? */
static inline bool
xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_blockcount > 0;
}

/* Do we have post-operation cleanups to perform? */
static inline bool
xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
				 XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
				 __XFS_EXCHMAPS_INO2_SHORTFORM);
}

/* Check both forks to make sure we can actually exchange their mappings. */
int
xfs_exchmaps_check_forks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_ifork	*ifp1, *ifp2;
	int			whichfork = xfs_exchmaps_reqfork(req);

	/* No fork? */
	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
	if (!ifp1 || !ifp2)
		return -EINVAL;

	/* We don't know how to exchange local format forks. */
	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_exchmaps_update_quota(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int64_t			ip1_delta = 0, ip2_delta = 0;
	unsigned int		qflag;

	qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
						      XFS_TRANS_DQ_BCOUNT;

	if (xfs_bmap_is_real_extent(irec1)) {
		ip1_delta -= irec1->br_blockcount;
		ip2_delta += irec1->br_blockcount;
	}

	if (xfs_bmap_is_real_extent(irec2)) {
		ip1_delta += irec2->br_blockcount;
		ip2_delta -= irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
}
#else
# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
#endif
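/*
 * Illustrative example (not part of the original source): if irec1 is a
 * written 8-block extent and irec2 is a hole, only the first branch above
 * fires, so ip1_delta == -8 and ip2_delta == +8: file 1 gives up eight
 * blocks of quota and file 2 gains them.  Holes and delalloc reservations
 * are not "real" extents and therefore never adjust the dquots.
 */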
/* Decide if we want to skip this mapping from file1. */
static inline bool
xfs_exchmaps_can_skip_mapping(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec)
{
	struct xfs_mount	*mp = xmi->xmi_ip1->i_mount;

	/* Do not skip this mapping if the caller did not tell us to. */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
		return false;

	/* Do not skip mapped, written mappings. */
	if (xfs_bmap_is_written_extent(irec))
		return false;

	/*
	 * The mapping is unwritten or a hole.  It cannot be a delalloc
	 * reservation because we already excluded those.  It cannot be an
	 * unwritten extent with dirty page cache because we flushed the page
	 * cache.  For files where the allocation unit is 1FSB (files on the
	 * data dev, rt files if the extent size is 1FSB), we can safely
	 * skip this mapping.
	 */
	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
		return true;

	/*
	 * For a realtime file with a multi-fsb allocation unit, the decision
	 * is trickier because we can only swap full allocation units.
	 * Unwritten mappings can appear in the middle of an rtx if the rtx
	 * is partially written, but they can also appear for preallocations.
	 *
	 * If the mapping is a hole, skip it entirely.  Holes should align
	 * with rtx boundaries.
	 */
	if (!xfs_bmap_is_real_extent(irec))
		return true;

	/*
	 * All mappings below this point are unwritten.
	 *
	 * - If the beginning is not aligned to an rtx, trim the end of the
	 *   mapping so that it does not cross an rtx boundary, and swap it.
	 *
	 * - If both ends are aligned to an rtx, skip the entire mapping.
	 */
	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
		xfs_fileoff_t	new_end;

		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
		irec->br_blockcount = min(irec->br_blockcount,
					  new_end - irec->br_startoff);
		return false;
	}
	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
		return true;

	/*
	 * All mappings below this point are unwritten, start on an rtx
	 * boundary, and do not end on an rtx boundary.
	 *
	 * - If the mapping is longer than one rtx, trim the end of the
	 *   mapping down to an rtx boundary and skip it.
	 *
	 * - The mapping is shorter than one rtx.  Swap it.
	 */
	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
		xfs_fileoff_t	new_end;

		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
				       mp->m_sb.sb_rextsize);
		irec->br_blockcount = new_end - irec->br_startoff;
		return true;
	}

	return false;
}
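/*
 * Illustrative walkthrough (not part of the original source), assuming an
 * rt extent size of 8 blocks:
 *
 *  - An unwritten mapping at br_startoff 5 with 20 blocks is misaligned at
 *    the front, so it is trimmed to blocks 5-7 (new_end = 8) and exchanged.
 *  - An unwritten mapping at br_startoff 8 with 16 blocks is rtx-aligned at
 *    both ends, so the whole mapping is skipped.
 *  - An unwritten mapping at br_startoff 8 with 11 blocks is longer than
 *    one rtx, so it is trimmed to 8 blocks (ending at block 16) and skipped.
 *  - An unwritten mapping at br_startoff 8 with 5 blocks is shorter than
 *    one rtx and must be exchanged to keep the allocation unit intact.
 */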
/*
 * Walk forward through the file ranges in @xmi until we find two different
 * mappings to exchange.  If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and xmi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them
 * as the left records in @adj (if provided) so that the simulation phase
 * can avoid an extra lookup.
 */
static int
xfs_exchmaps_find_mappings(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2,
	struct xfs_exchmaps_adjacent	*adj)
{
	int			nimaps;
	int			bmap_flags;
	int			error;

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
		/* Read mapping from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
				       xmi->xmi_blockcount, irec1, &nimaps,
				       bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != xmi->xmi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc
			 * mapping or something that doesn't match what we
			 * asked for, since the caller flushed both inodes
			 * and we hold the ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
			continue;
		}

		/* Read mapping from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
				       irec1->br_blockcount, irec2, &nimaps,
				       bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != xmi->xmi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc
			 * mapping or something that doesn't match what we
			 * asked for, since the caller flushed both inodes
			 * and we hold the ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * We can only exchange as many blocks as the smaller of the
		 * two mappings covers.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

		/* We found something to exchange, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two mappings pointing to the same physical block must not
		 * have different states; that's filesystem corruption.  Move
		 * on to the next mapping if they're both holes or both point
		 * to the same physical space extent.
		 */
		if (irec1->br_state != irec2->br_state) {
			xfs_bmap_mark_sick(xmi->xmi_ip1,
					   xfs_exchmaps_whichfork(xmi));
			xfs_bmap_mark_sick(xmi->xmi_ip2,
					   xfs_exchmaps_whichfork(xmi));
			return -EFSCORRUPTED;
		}

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}

/* Exchange these two mappings. */
static void
xfs_exchmaps_one_step(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int			whichfork = xfs_exchmaps_whichfork(xmi);

	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

	/* Remove both mappings. */
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings.  We exchange the file offsets between the
	 * two maps and add the opposite map, which has the effect of filling
	 * the logical offsets we just unmapped, but with the physical
	 * mapping information exchanged.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

	/* Make sure we're not adding mappings past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
					 xmi->xmi_isize1);
		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
					 xmi->xmi_isize2);
	}

	/*
	 * Advance our cursor and exit.  The caller (either defer ops or log
	 * recovery) will log the XMD item, and if *blockcount is nonzero, it
	 * will log a new XMI item for the remainder and call us back.
	 */
	xmi_advance(xmi, irec1);
}
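/*
 * Illustrative before/after sketch (not part of the original source).
 * Suppose irec1 maps file1 offset 100 to physical block 5000 and irec2 maps
 * file2 offset 300 to physical block 9000, both 10 blocks long:
 *
 *	before:	file1 [100, +10] -> 5000	file2 [300, +10] -> 9000
 *	unmap:	file1 [100, +10] -> (hole)	file2 [300, +10] -> (hole)
 *	swap:	irec1.br_startoff = 300		irec2.br_startoff = 100
 *	map:	file1 [100, +10] -> 9000	file2 [300, +10] -> 5000
 *
 * Swapping only the file offsets and then mapping the *opposite* irec into
 * each inode is what moves the physical extents between the files.
 */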
/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_attr_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_attr_geo,
		.whichfork	= XFS_ATTR_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_buf		*bp;
	int			forkoff;
	int			error;

	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
		return 0;

	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
				    &bp);
	if (error)
		return error;

	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
	if (forkoff == 0)
		return 0;

	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}

/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_dir_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_dir_geo,
		.whichfork	= XFS_DATA_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_dir2_sf_hdr	sfh;
	struct xfs_buf		*bp;
	int			size;
	int			error = 0;

	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
		return error;

	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino,
				    &bp);
	if (error)
		return error;

	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
		return 0;

	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}

/* Convert inode2's remote symlink target back to shortform, if possible. */
STATIC int
xfs_exchmaps_link_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_inode	*ip = xmi->xmi_ip2;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	char			*buf;
	int			error;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
		return 0;

	/*
	 * Read the current symlink target into a buffer.  Note that
	 * __GFP_NOFAIL would contradict the NULL check below, so the
	 * allocation is allowed to fail.
	 */
	buf = kmalloc(ip->i_disk_size + 1, GFP_KERNEL | __GFP_NOLOCKDEP);
	if (!buf) {
		ASSERT(0);
		return -ENOMEM;
	}

	error = xfs_symlink_remote_read(ip, buf);
	if (error)
		goto free;

	/* Remove the blocks. */
	error = xfs_symlink_remote_truncate(tp, ip);
	if (error)
		goto free;

	/* Convert fork to local format and log our changes. */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	ifp->if_format = XFS_DINODE_FMT_LOCAL;
	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
free:
	kfree(buf);
	return error;
}

/* Clear the reflink flag after an exchange. */
static inline void
xfs_exchmaps_clear_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_unset_inode_flag(ip);

	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Finish whatever work might come after an exchange operation. */
static int
xfs_exchmaps_do_postop_work(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
		int		error = 0;

		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
			error = xfs_exchmaps_attr_to_sf(tp, xmi);
		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_dir_to_sf(tp, xmi);
		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_link_to_sf(tp, xmi);
		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
		if (error)
			return error;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	return 0;
}
/* Finish one step in a mapping exchange operation, possibly relogging. */
int
xfs_exchmaps_finish_one(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_bmbt_irec		irec1, irec2;
	int				error;

	if (xmi_has_more_exchange_work(xmi)) {
		/*
		 * If the operation state says that some range of the files
		 * has not yet been exchanged, look for mappings in that
		 * range to exchange.  If we find some mappings, exchange
		 * them.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
		if (error)
			return error;

		if (xmi_has_more_exchange_work(xmi))
			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

		/*
		 * If the caller asked us to exchange the file sizes after
		 * the exchange and either we just exchanged the last
		 * mappings in the range or we didn't find anything to
		 * exchange, update the ondisk file sizes.
		 */
		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
		    !xmi_has_more_exchange_work(xmi)) {
			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
		}
	} else if (xmi_has_postop_work(xmi)) {
		/*
		 * Now that we're finished with the exchange operation,
		 * complete the post-op cleanup work.
		 */
		error = xfs_exchmaps_do_postop_work(tp, xmi);
		if (error)
			return error;
	}

	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
		return -EIO;

	/* If we still have work to do, ask for a new transaction. */
	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
		return -EAGAIN;
	}

	/*
	 * If we reach here, we've finished all the exchange work and the
	 * post operation work.  The last thing we need to do before
	 * returning to the caller is to make sure that COW forks are set up
	 * correctly.
	 */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
	}

	return 0;
}

/*
 * Compute the number of bmbt blocks we should reserve for each file.  In
 * the worst case, each exchange will fill a hole with a new mapping, which
 * could result in a btree split every time we add a new leaf block.
 */
static inline uint64_t
xfs_exchmaps_bmbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
	       XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
}

/* Compute the space we should reserve for the rmap btree expansions. */
static inline uint64_t
xfs_exchmaps_rmapbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	if (!xfs_has_rmapbt(mp))
		return 0;
	if (XFS_IS_REALTIME_INODE(req->ip1))
		return howmany_64(req->nr_exchanges,
				  XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) *
		       XFS_RTRMAPADD_SPACE_RES(mp);

	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
	       XFS_RMAPADD_SPACE_RES(mp);
}
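/*
 * Illustrative arithmetic (not part of the original source): with an
 * estimated 1000 exchanges and, hypothetically, 125 contiguous bmaps per
 * bmbt block, we would reserve howmany_64(1000, 125) = 8 "extent add" units
 * of worst-case bmbt expansion per file; the rmapbt reservation is computed
 * the same way from its own per-block fanout.  The real fanout values
 * depend on the block size and come from the XFS_MAX_CONTIG_*_PER_BLOCK
 * macros.
 */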
/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
int
xfs_exchmaps_estimate_overhead(
	struct xfs_exchmaps_req	*req)
{
	struct xfs_mount	*mp = req->ip1->i_mount;
	xfs_filblks_t		bmbt_blocks;
	xfs_filblks_t		rmapbt_blocks;
	xfs_filblks_t		resblks = req->resblks;

	/*
	 * Compute the number of bmbt and rmapbt blocks we might need to
	 * handle the estimated number of exchanges.
	 */
	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);

	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);

	/* Make sure the change in file block count doesn't overflow. */
	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
		return -EFBIG;
	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
		return -EFBIG;

	/*
	 * Add together the number of blocks we need to handle btree growth,
	 * then add it to the number of blocks we need to reserve for this
	 * transaction.  Each btree estimate is added once per file.
	 */
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;

	/* Can't actually reserve more than UINT_MAX blocks. */
	if (resblks > UINT_MAX)
		return -ENOSPC;

	req->resblks = resblks;
	trace_xfs_exchmaps_final_estimate(req);
	return 0;
}

/* Decide if we can merge two real mappings. */
static inline bool
xmi_can_merge(
	const struct xfs_bmbt_irec	*b1,
	const struct xfs_bmbt_irec	*b2)
{
	/* Don't merge holes. */
	if (b1->br_startblock == HOLESTARTBLOCK ||
	    b2->br_startblock == HOLESTARTBLOCK)
		return false;

	/* Don't merge delalloc or other unreal mappings. */
	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
		return false;

	if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
	    b1->br_state == b2->br_state &&
	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		return true;

	return false;
}

/*
 * Decide if we can merge three mappings.  The caller must ensure that none
 * of the three mappings is a hole or a delalloc reservation.
 */
static inline bool
xmi_can_merge_all(
	const struct xfs_bmbt_irec	*l,
	const struct xfs_bmbt_irec	*m,
	const struct xfs_bmbt_irec	*r)
{
	xfs_filblks_t	new_len;

	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
	return new_len <= XFS_MAX_BMBT_EXTLEN;
}
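/*
 * Illustrative example (not part of the original source): a mapping of file
 * offsets 0-7 to physical blocks 5000-5007 merges with a mapping of offsets
 * 8-11 to blocks 5008-5011 when both are in the same written/unwritten
 * state, yielding one 12-block record.  If the second mapping instead
 * pointed at block 6000, the pair would be logically but not physically
 * contiguous, and xmi_can_merge() would return false.
 */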
#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)

/* Estimate the effect of a single exchange on mapping count. */
static inline int
xmi_delta_nextents_step(
	struct xfs_mount		*mp,
	const struct xfs_bmbt_irec	*left,
	const struct xfs_bmbt_irec	*curr,
	const struct xfs_bmbt_irec	*new,
	const struct xfs_bmbt_irec	*right)
{
	bool			lhole, rhole, chole, nhole;
	unsigned int		state = 0;
	int			ret = 0;

	lhole = left->br_startblock == HOLESTARTBLOCK;
	rhole = right->br_startblock == HOLESTARTBLOCK;
	chole = curr->br_startblock == HOLESTARTBLOCK;
	nhole = new->br_startblock == HOLESTARTBLOCK;

	if (chole)
		state |= CHOLE;
	if (!lhole && !chole && xmi_can_merge(left, curr))
		state |= CLEFT_CONTIG;
	if (!rhole && !chole && xmi_can_merge(curr, right))
		state |= CRIGHT_CONTIG;
	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
	    !xmi_can_merge_all(left, curr, right))
		state &= ~CRIGHT_CONTIG;

	if (nhole)
		state |= NHOLE;
	if (!lhole && !nhole && xmi_can_merge(left, new))
		state |= NLEFT_CONTIG;
	if (!rhole && !nhole && xmi_can_merge(new, right))
		state |= NRIGHT_CONTIG;
	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
	    !xmi_can_merge_all(left, new, right))
		state &= ~NRIGHT_CONTIG;

	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
	case CLEFT_CONTIG | CRIGHT_CONTIG:
		/*
		 * left/curr/right are the same mapping, so deleting curr
		 * causes 2 new mappings to be created.
		 */
		ret += 2;
		break;
	case 0:
		/*
		 * curr is not contiguous with any mapping, so we remove curr
		 * completely
		 */
		ret--;
		break;
	case CHOLE:
		/* hole, do nothing */
		break;
	case CLEFT_CONTIG:
	case CRIGHT_CONTIG:
		/* trim either left or right, no change */
		break;
	}

	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
	case NLEFT_CONTIG | NRIGHT_CONTIG:
		/*
		 * left/curr/right will become the same mapping, so adding
		 * curr causes the deletion of right.
		 */
		ret--;
		break;
	case 0:
		/* new is not contiguous with any mapping */
		ret++;
		break;
	case NHOLE:
		/* hole, do nothing. */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right,
			ret, state);
	return ret;
}
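/*
 * Illustrative example (not part of the original source): if curr is not
 * contiguous with either neighbor (first switch takes the 0 case, ret--)
 * and the incoming mapping merges with left only (second switch takes the
 * NLEFT_CONTIG case, no change), the net delta for this fork is -1: one
 * record disappears and nothing replaces it.  Conversely, punching curr out
 * of the middle of one large contiguous mapping while the replacement
 * merges with neither neighbor gives +2 followed by +1, a worst case of
 * three extra records for a single step.
 */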
/* Make sure we don't overflow the extent (mapping) counters. */
static inline int
xmi_ensure_delta_nextents(
	struct xfs_exchmaps_req	*req,
	struct xfs_inode	*ip,
	int64_t			delta)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			whichfork = xfs_exchmaps_reqfork(req);
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	uint64_t		new_nextents;
	xfs_extnum_t		max_nextents;

	if (delta < 0)
		return 0;

	/*
	 * It's always an error if the delta causes integer overflow.  delta
	 * needs an explicit cast here to avoid warnings about implicit casts
	 * coded into the overflow check.
	 */
	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
			       &new_nextents))
		return -EFBIG;

	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
	    new_nextents > 10)
		return -EFBIG;

	/*
	 * We always promote both inodes to have large extent counts if the
	 * superblock feature is enabled, so we only need to check against
	 * the theoretical maximum.
	 */
	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
					     whichfork);
	if (new_nextents > max_nextents)
		return -EFBIG;

	return 0;
}

/* Find the next mapping after irec. */
static inline int
xmi_next(
	struct xfs_inode		*ip,
	int				bmap_flags,
	const struct xfs_bmbt_irec	*irec,
	struct xfs_bmbt_irec		*nrec)
{
	xfs_fileoff_t		off;
	xfs_filblks_t		blockcount;
	int			nimaps = 1;
	int			error;

	off = irec->br_startoff + irec->br_blockcount;
	blockcount = XFS_MAX_FILEOFF - off;
	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps,
			       bmap_flags);
	if (error)
		return error;
	if (nrec->br_startblock == DELAYSTARTBLOCK ||
	    nrec->br_startoff != off) {
		/*
		 * If we don't get the mapping we want, return a zero-length
		 * mapping, which our estimator function will pretend is a
		 * hole.  We shouldn't get delalloc reservations.
		 */
		nrec->br_startblock = HOLESTARTBLOCK;
	}

	return 0;
}

int __init
xfs_exchmaps_intent_init_cache(void)
{
	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
			sizeof(struct xfs_exchmaps_intent),
			0, 0, NULL);

	return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_exchmaps_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_exchmaps_intent_cache);
	xfs_exchmaps_intent_cache = NULL;
}

/*
 * Decide if we will exchange the reflink flags between the two files after
 * the exchange.  The only time we want to do this is if we're exchanging
 * all mappings under EOF and the inode reflink flags have different states.
 */
static inline bool
xmi_can_exchange_reflink_flags(
	const struct xfs_exchmaps_req	*req,
	unsigned int			reflink_state)
{
	struct xfs_mount	*mp = req->ip1->i_mount;

	if (hweight32(reflink_state) != 1)
		return false;
	if (req->startoff1 != 0 || req->startoff2 != 0)
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
		return false;
	return true;
}
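/*
 * Illustrative example (not part of the original source): reflink_state
 * packs ip1's reflink flag into bit 0 and ip2's into bit 1.  hweight32()
 * counting exactly one set bit means precisely one of the two files is
 * reflinked; only then does exchanging the whole files effectively transfer
 * the flag.  If both or neither file is reflinked (state 0x3 or 0x0) there
 * is nothing to exchange, and a partial-range exchange never moves the
 * flag.
 */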
/* Allocate and initialize a new incore intent item from a request. */
struct xfs_exchmaps_intent *
xfs_exchmaps_init_intent(
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;
	unsigned int			rs = 0;

	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
			GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&xmi->xmi_list);
	xmi->xmi_ip1 = req->ip1;
	xmi->xmi_ip2 = req->ip2;
	xmi->xmi_startoff1 = req->startoff1;
	xmi->xmi_startoff2 = req->startoff2;
	xmi->xmi_blockcount = req->blockcount;
	xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;

	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
		return xmi;
	}

	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
		xmi->xmi_isize1 = req->ip2->i_disk_size;
		xmi->xmi_isize2 = req->ip1->i_disk_size;
	}

	/* Record the state of each inode's reflink flag before the op. */
	if (xfs_is_reflink_inode(req->ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(req->ip2))
		rs |= 2;

	/*
	 * Figure out if we're clearing the reflink flags (which effectively
	 * exchanges them) after the operation.
	 */
	if (xmi_can_exchange_reflink_flags(req, rs)) {
		if (rs & 1)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
		if (rs & 2)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;

	return xmi;
}

/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_exchmaps_estimate(
	struct xfs_exchmaps_req		*req)
{
	struct xfs_exchmaps_intent	*xmi;
	struct xfs_bmbt_irec		irec1, irec2;
	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
	int64_t				d_nexts1, d_nexts2;
	int				bmap_flags;
	int				error;

	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
	xmi = xfs_exchmaps_init_intent(req);

	/*
	 * To guard against the possibility of overflowing the extent
	 * counters, we have to estimate an upper bound on the potential
	 * increase in that counter.  We can split the mapping at each end of
	 * the range, and for each step of the exchange we can split the
	 * mapping that we're working on if the mappings do not align.
	 */
	d_nexts1 = d_nexts2 = 3;

	while (xmi_has_more_exchange_work(xmi)) {
		/*
		 * Walk through the file ranges until we find something to
		 * exchange.  Because we're simulating the exchange, pass in
		 * adj to capture skipped mappings for correct estimation of
		 * bmbt record merges.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
		if (error)
			goto out_free;
		if (!xmi_has_more_exchange_work(xmi))
			break;

		/* Update accounting. */
		if (xfs_bmap_is_real_extent(&irec1))
			ip1_blocks += irec1.br_blockcount;
		if (xfs_bmap_is_real_extent(&irec2))
			ip2_blocks += irec2.br_blockcount;
		req->nr_exchanges++;

		/* Read the next mappings from both files. */
		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
		if (error)
			goto out_free;

		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
		if (error)
			goto out_free;

		/* Update extent count deltas. */
		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left1, &irec1, &irec2, &adj.right1);

		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left2, &irec2, &irec1, &adj.right2);

		/* Now pretend we exchanged the mappings. */
		if (xmi_can_merge(&adj.left2, &irec1))
			adj.left2.br_blockcount += irec1.br_blockcount;
		else
			memcpy(&adj.left2, &irec1, sizeof(irec1));

		if (xmi_can_merge(&adj.left1, &irec2))
			adj.left1.br_blockcount += irec2.br_blockcount;
		else
			memcpy(&adj.left1, &irec2, sizeof(irec2));

		xmi_advance(xmi, &irec1);
	}

	/* Account for the blocks that are being exchanged. */
	if (XFS_IS_REALTIME_INODE(req->ip1) &&
	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
		req->ip1_rtbcount = ip1_blocks;
		req->ip2_rtbcount = ip2_blocks;
	} else {
		req->ip1_bcount = ip1_blocks;
		req->ip2_bcount = ip2_blocks;
	}

	/*
	 * Make sure that both forks have enough slack left in their extent
	 * counters that the exchange operation will not overflow.
	 */
	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
	if (req->ip1 == req->ip2) {
		error = xmi_ensure_delta_nextents(req, req->ip1,
				d_nexts1 + d_nexts2);
	} else {
		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
		if (error)
			goto out_free;
		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
	}
	if (error)
		goto out_free;

	trace_xfs_exchmaps_initial_estimate(req);
	error = xfs_exchmaps_estimate_overhead(req);
out_free:
	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
	return error;
}
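/*
 * Hypothetical caller sketch (not part of the original source); the fields
 * shown are the ones initialized above, everything else is an assumption:
 *
 *	struct xfs_exchmaps_req	req = {
 *		.ip1		= ip1,
 *		.ip2		= ip2,
 *		.startoff1	= 0,
 *		.startoff2	= 0,
 *		.blockcount	= XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip1))),
 *	};
 *
 *	error = xfs_exchmaps_estimate(&req);
 *	if (error)
 *		return error;
 *
 * On success, req->resblks bounds the transaction block reservation and
 * req->nr_exchanges counts the mapping pairs that would be swapped.
 */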
/* Set the reflink flag before an operation. */
static inline void
xfs_exchmaps_set_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_set_inode_flag(ip);

	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/*
 * If either file has shared blocks and we're exchanging data forks, we must
 * flag the other file as having shared blocks so that we get the
 * shared-block rmap functions if we need to fix up the rmaps.
 */
void
xfs_exchmaps_ensure_reflink(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_intent *xmi)
{
	unsigned int		rs = 0;

	if (xfs_is_reflink_inode(xmi->xmi_ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(xmi->xmi_ip2))
		rs |= 2;

	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);

	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
}

/* Set the large extent count flag before an operation if needed. */
static inline void
xfs_exchmaps_ensure_large_extent_counts(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	if (xfs_inode_has_large_extent_counts(ip))
		return;

	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Widen the extent counter fields of both inodes if necessary. */
void
xfs_exchmaps_upgrade_extent_counts(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_intent *xmi)
{
	if (!xfs_has_large_extent_counts(tp->t_mountp))
		return;

	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
}
/*
 * Schedule an exchange of a range of mappings from one inode to another.
 *
 * The use of file mapping exchange log intent items ensures the operation
 * can be resumed even if the system goes down.  The caller must commit the
 * transaction to start the work.
 *
 * The caller must ensure the inodes are joined to the transaction and
 * ILOCKed; they will still be joined to the transaction at exit.
 */
void
xfs_exchange_mappings(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;

	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);

	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
	ASSERT(xfs_has_exchange_range(tp->t_mountp));

	if (req->blockcount == 0)
		return;

	xmi = xfs_exchmaps_init_intent(req);
	xfs_exchmaps_defer_add(tp, xmi);
	xfs_exchmaps_ensure_reflink(tp, xmi);
	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
}
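/*
 * Hypothetical caller sketch (not part of the original source); the
 * transaction setup is abbreviated and the reservation/locking helpers
 * named here are assumptions about the surrounding code:
 *
 *	error = xfs_exchmaps_estimate(&req);	// fills req->resblks
 *	if (error)
 *		return error;
 *	// ... allocate a transaction with req->resblks blocks reserved,
 *	// then xfs_ilock() both inodes and xfs_trans_ijoin() them ...
 *	xfs_exchange_mappings(tp, &req);
 *	error = xfs_trans_commit(tp);		// starts the deferred work
 *
 * Each call to xfs_exchmaps_finish_one() then exchanges one mapping pair
 * per transaction, returning -EAGAIN until the whole range and the post-op
 * cleanups are done.
 */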