// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_exchmaps_item.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_symlink_remote.h"

struct kmem_cache	*xfs_exchmaps_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_exchmaps_adjacent {
	struct xfs_bmbt_irec		left1;
	struct xfs_bmbt_irec		right1;
	struct xfs_bmbt_irec		left2;
	struct xfs_bmbt_irec		right2;
};

#define ADJACENT_INIT { \
	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}

/* Information to reset reflink flag / CoW fork state after an exchange. */

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them. If there's a CoW fork and it
 * has mappings in it, make sure the inodes are tagged appropriately so that
 * speculative preallocations can be GC'd if we run low on space.
 */
static inline void
xfs_exchmaps_ensure_cowfork(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*cfork;

	if (xfs_is_reflink_inode(ip))
		xfs_ifork_init_cow(ip);

	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (!cfork)
		return;
	if (cfork->if_bytes > 0)
		xfs_inode_set_cowblocks_tag(ip);
	else
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Adjust the on-disk inode size upwards if needed so that we never add
 * mappings into the file past EOF. This is crucial so that log recovery won't
 * get confused by the sudden appearance of post-EOF mappings.
 */
STATIC void
xfs_exchmaps_update_size(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fsize_t		new_isize)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_fsize_t		len;

	if (new_isize < 0)
		return;

	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
		  new_isize);

	if (len <= ip->i_disk_size)
		return;

	trace_xfs_exchmaps_update_inode_size(ip, len);

	ip->i_disk_size = len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Advance the incore state tracking after exchanging a mapping. */
static inline void
xmi_advance(
	struct xfs_exchmaps_intent	*xmi,
	const struct xfs_bmbt_irec	*irec)
{
	xmi->xmi_startoff1 += irec->br_blockcount;
	xmi->xmi_startoff2 += irec->br_blockcount;
	xmi->xmi_blockcount -= irec->br_blockcount;
}
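
/*
 * Worked example (illustrative numbers, not from the original source): if
 * the cursor is at xmi_startoff1 = 10, xmi_startoff2 = 100 with
 * xmi_blockcount = 50, and we just exchanged a 20-block mapping, then
 * xmi_advance() moves the cursor to offsets 30 and 120 with 30 blocks of
 * work remaining.
 */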

/* Do we still have more mappings to exchange? */
static inline bool
xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_blockcount > 0;
}

/* Do we have post-operation cleanups to perform? */
static inline bool
xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
				 XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
				 __XFS_EXCHMAPS_INO2_SHORTFORM);
}

/* Check all mappings to make sure we can actually exchange them. */
int
xfs_exchmaps_check_forks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_ifork	*ifp1, *ifp2;
	int			whichfork = xfs_exchmaps_reqfork(req);

	/* No fork? */
	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
	if (!ifp1 || !ifp2)
		return -EINVAL;

	/* We don't know how to exchange local format forks. */
	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_exchmaps_update_quota(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int64_t			ip1_delta = 0, ip2_delta = 0;
	unsigned int		qflag;

	qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
						      XFS_TRANS_DQ_BCOUNT;

	if (xfs_bmap_is_real_extent(irec1)) {
		ip1_delta -= irec1->br_blockcount;
		ip2_delta += irec1->br_blockcount;
	}

	if (xfs_bmap_is_real_extent(irec2)) {
		ip1_delta += irec2->br_blockcount;
		ip2_delta -= irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
}
#else
# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
#endif

/* Decide if we want to skip this mapping from file1. */
static inline bool
xfs_exchmaps_can_skip_mapping(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec)
{
	struct xfs_mount	*mp = xmi->xmi_ip1->i_mount;

	/* Do not skip this mapping unless the caller told us to. */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
		return false;

	/* Do not skip mapped, written mappings. */
	if (xfs_bmap_is_written_extent(irec))
		return false;

	/*
	 * The mapping is unwritten or a hole. It cannot be a delalloc
	 * reservation because we already excluded those. It cannot be an
	 * unwritten extent with dirty page cache because we flushed the page
	 * cache. For files where the allocation unit is 1FSB (files on the
	 * data dev, rt files if the extent size is 1FSB), we can safely
	 * skip this mapping.
	 */
	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
		return true;

	/*
	 * For a realtime file with a multi-fsb allocation unit, the decision
	 * is trickier because we can only swap full allocation units.
	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
	 * partially written, but they can also appear for preallocations.
	 *
	 * If the mapping is a hole, skip it entirely. Holes should align with
	 * rtx boundaries.
	 */
	if (!xfs_bmap_is_real_extent(irec))
		return true;
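
	/*
	 * Illustrative example (hypothetical numbers): with sb_rextsize = 8,
	 * an unwritten mapping covering blocks 5-24 does not start
	 * rtx-aligned, so the first branch below trims it to blocks 5-7
	 * (ending at the rtx boundary at block 8) and exchanges that piece;
	 * later iterations then see mappings that start on an rtx boundary.
	 */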

	/*
	 * All mappings below this point are unwritten.
	 *
	 * - If the beginning is not aligned to an rtx, trim the end of the
	 *   mapping so that it does not cross an rtx boundary, and swap it.
	 *
	 * - If both ends are aligned to an rtx, skip the entire mapping.
	 */
	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
		xfs_fileoff_t	new_end;

		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
		irec->br_blockcount = min(irec->br_blockcount,
					  new_end - irec->br_startoff);
		return false;
	}
	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
		return true;

	/*
	 * All mappings below this point are unwritten, start on an rtx
	 * boundary, and do not end on an rtx boundary.
	 *
	 * - If the mapping is longer than one rtx, trim the end of the mapping
	 *   down to an rtx boundary and skip it.
	 *
	 * - The mapping is shorter than one rtx. Swap it.
	 */
	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
		xfs_fileoff_t	new_end;

		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
				       mp->m_sb.sb_rextsize);
		irec->br_blockcount = new_end - irec->br_startoff;
		return true;
	}

	return false;
}

/*
 * Walk forward through the file ranges in @xmi until we find two different
 * mappings to exchange. If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and xmi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 */
static int
xfs_exchmaps_find_mappings(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2,
	struct xfs_exchmaps_adjacent	*adj)
{
	int				nimaps;
	int				bmap_flags;
	int				error;

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
		/* Read mapping from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
				xmi->xmi_blockcount, irec1, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != xmi->xmi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
			continue;
		}

		/* Read mapping from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
				irec1->br_blockcount, irec2, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != xmi->xmi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}
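
		/*
		 * Example (illustrative): if file1's mapping here covers 16
		 * blocks but file2's mapping at the same point in the range
		 * covers only 10, we trim irec1 to 10 blocks, exchange that
		 * much, and pick up the remaining 6 blocks on the next trip
		 * through the loop.
		 */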

		/*
		 * We can only exchange as many blocks as the smaller of the
		 * two mappings covers.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

		/* We found something to exchange, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two mappings pointing to the same physical block must not
		 * have different states; that's filesystem corruption. Move
		 * on to the next mapping if they're both holes or both point
		 * to the same physical space extent.
		 */
		if (irec1->br_state != irec2->br_state) {
			xfs_bmap_mark_sick(xmi->xmi_ip1,
					   xfs_exchmaps_whichfork(xmi));
			xfs_bmap_mark_sick(xmi->xmi_ip2,
					   xfs_exchmaps_whichfork(xmi));
			return -EFSCORRUPTED;
		}

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}

/* Exchange these two mappings. */
static void
xfs_exchmaps_one_step(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int				whichfork = xfs_exchmaps_whichfork(xmi);

	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

	/* Remove both mappings. */
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings. We exchange the file offsets between the two
	 * maps and add the opposite map, which has the effect of filling the
	 * logical offsets we just unmapped, but with the physical mapping
	 * information exchanged.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

	/* Make sure we're not adding mappings past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
					 xmi->xmi_isize1);
		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
					 xmi->xmi_isize2);
	}

	/*
	 * Advance our cursor and exit. The caller (either defer ops or log
	 * recovery) will log the XMD item, and if the remaining blockcount is
	 * nonzero, it will log a new XMI item for the remainder and call us
	 * back.
	 */
	xmi_advance(xmi, irec1);
}

/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_attr_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_attr_geo,
		.whichfork	= XFS_ATTR_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_buf		*bp;
	int			forkoff;
	int			error;

	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
		return 0;

	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
			&bp);
	if (error)
		return error;

	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
	if (forkoff == 0)
		return 0;

	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}
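
/*
 * The dir and symlink helpers below follow the same pattern as the attr
 * conversion above: read the single block backing the fork, check that its
 * contents would fit in the inode literal area, and only then convert the
 * fork back to local format within the same transaction that finished the
 * exchange.
 */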

/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_exchmaps_dir_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_dir_geo,
		.whichfork	= XFS_DATA_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_dir2_sf_hdr	sfh;
	struct xfs_buf		*bp;
	int			size;
	int			error = 0;

	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
		return error;

	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
	if (error)
		return error;

	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
		return 0;

	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}

/* Convert inode2's remote symlink target back to shortform, if possible. */
STATIC int
xfs_exchmaps_link_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_inode	*ip = xmi->xmi_ip2;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	char			*buf;
	int			error;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
		return 0;

	/*
	 * Read the current symlink target into a buffer. Note that
	 * __GFP_NOFAIL was dropped here: a nofail allocation can never
	 * return NULL, which would have made the error branch below dead
	 * code, and this path can return -ENOMEM gracefully.
	 */
	buf = kmalloc(ip->i_disk_size + 1, GFP_KERNEL | __GFP_NOLOCKDEP);
	if (!buf) {
		ASSERT(0);
		return -ENOMEM;
	}

	error = xfs_symlink_remote_read(ip, buf);
	if (error)
		goto free;

	/* Remove the blocks. */
	error = xfs_symlink_remote_truncate(tp, ip);
	if (error)
		goto free;

	/* Convert fork to local format and log our changes. */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	ifp->if_format = XFS_DINODE_FMT_LOCAL;
	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
free:
	kfree(buf);
	return error;
}

/* Clear the reflink flag after an exchange. */
static inline void
xfs_exchmaps_clear_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_unset_inode_flag(ip);

	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Finish whatever work might come after an exchange operation. */
static int
xfs_exchmaps_do_postop_work(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
		int	error = 0;

		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
			error = xfs_exchmaps_attr_to_sf(tp, xmi);
		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_dir_to_sf(tp, xmi);
		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_link_to_sf(tp, xmi);
		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
		if (error)
			return error;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	return 0;
}
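
/*
 * A rough sketch of how the next function is driven (assumed caller
 * behavior, not code in this file): the defer ops machinery or log
 * recovery calls xfs_exchmaps_finish_one() with a transaction; on -EAGAIN
 * it logs a new intent item for the remaining work, rolls the transaction,
 * and calls back until the function returns 0.
 */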

/* Finish one step in a mapping exchange operation, possibly relogging. */
int
xfs_exchmaps_finish_one(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_bmbt_irec		irec1, irec2;
	int				error;

	if (xmi_has_more_exchange_work(xmi)) {
		/*
		 * If the operation state says that some range of the files
		 * has not yet been exchanged, look for mappings in that range
		 * to exchange. If we find some mappings, exchange them.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
		if (error)
			return error;

		if (xmi_has_more_exchange_work(xmi))
			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

		/*
		 * If the caller asked us to exchange the file sizes after the
		 * exchange and either we just exchanged the last mappings in
		 * the range or we didn't find anything to exchange, update the
		 * ondisk file sizes.
		 */
		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
		    !xmi_has_more_exchange_work(xmi)) {
			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
		}
	} else if (xmi_has_postop_work(xmi)) {
		/*
		 * Now that we're finished with the exchange operation,
		 * complete the post-op cleanup work.
		 */
		error = xfs_exchmaps_do_postop_work(tp, xmi);
		if (error)
			return error;
	}

	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
		return -EIO;

	/* If we still have work to do, ask for a new transaction. */
	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
		return -EAGAIN;
	}

	/*
	 * If we reach here, we've finished all the exchange work and the post
	 * operation work. The last thing we need to do before returning to
	 * the caller is to make sure that COW forks are set up correctly.
	 */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
	}

	return 0;
}

/*
 * Compute the number of bmbt blocks we should reserve for each file. In the
 * worst case, each exchange will fill a hole with a new mapping, which could
 * result in a btree split every time we add a new leaf block.
 */
static inline uint64_t
xfs_exchmaps_bmbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	return howmany_64(req->nr_exchanges,
			XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
		XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
}

/* Compute the space we should reserve for the rmap btree expansions. */
static inline uint64_t
xfs_exchmaps_rmapbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	if (!xfs_has_rmapbt(mp))
		return 0;
	if (XFS_IS_REALTIME_INODE(req->ip1))
		return 0;

	return howmany_64(req->nr_exchanges,
			XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
		XFS_RMAPADD_SPACE_RES(mp);
}

/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
int
xfs_exchmaps_estimate_overhead(
	struct xfs_exchmaps_req	*req)
{
	struct xfs_mount	*mp = req->ip1->i_mount;
	xfs_filblks_t		bmbt_blocks;
	xfs_filblks_t		rmapbt_blocks;
	xfs_filblks_t		resblks = req->resblks;
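
	/*
	 * Worked example with made-up geometry: if req->nr_exchanges is 1000
	 * and a bmbt block holds 250 contiguous records, the bmbt helper
	 * above returns howmany_64(1000, 250) = 4 times the per-extent-add
	 * reservation for this fork; the rmapbt helper does the same for
	 * rmap records when that feature is enabled.
	 */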

	/*
	 * Compute the number of bmbt and rmapbt blocks we might need to handle
	 * the estimated number of exchanges.
	 */
	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);

	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);

	/* Make sure the change in file block count doesn't overflow. */
	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
		return -EFBIG;
	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
		return -EFBIG;

	/*
	 * Add together the number of blocks we need to handle btree growth,
	 * then add it to the number of blocks we need to reserve to this
	 * transaction.
	 */
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;

	/* Can't actually reserve more than UINT_MAX blocks. */
	if (resblks > UINT_MAX)
		return -ENOSPC;

	req->resblks = resblks;
	trace_xfs_exchmaps_final_estimate(req);
	return 0;
}

/* Decide if we can merge two real mappings. */
static inline bool
xmi_can_merge(
	const struct xfs_bmbt_irec	*b1,
	const struct xfs_bmbt_irec	*b2)
{
	/* Don't merge holes. */
	if (b1->br_startblock == HOLESTARTBLOCK ||
	    b2->br_startblock == HOLESTARTBLOCK)
		return false;

	/* Don't merge delalloc reservations either. */
	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
		return false;

	if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
	    b1->br_state == b2->br_state &&
	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		return true;

	return false;
}

/*
 * Decide if we can merge three mappings. The caller must ensure that none of
 * the three mappings are holes or delalloc reservations.
 */
static inline bool
xmi_can_merge_all(
	const struct xfs_bmbt_irec	*l,
	const struct xfs_bmbt_irec	*m,
	const struct xfs_bmbt_irec	*r)
{
	xfs_filblks_t	new_len;

	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
	return new_len <= XFS_MAX_BMBT_EXTLEN;
}

#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)
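
/*
 * Example of how the step function below uses these bits (illustrative):
 * if the mapping being removed (curr) is contiguous with its left neighbor
 * but the incoming mapping (new) is a hole, state is CLEFT_CONTIG | NHOLE;
 * the first switch charges nothing for trimming left and the second
 * charges nothing for the hole, so the net change in mapping count is 0.
 */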

/* Estimate the effect of a single exchange on mapping count. */
static inline int
xmi_delta_nextents_step(
	struct xfs_mount		*mp,
	const struct xfs_bmbt_irec	*left,
	const struct xfs_bmbt_irec	*curr,
	const struct xfs_bmbt_irec	*new,
	const struct xfs_bmbt_irec	*right)
{
	bool			lhole, rhole, chole, nhole;
	unsigned int		state = 0;
	int			ret = 0;

	lhole = left->br_startblock == HOLESTARTBLOCK;
	rhole = right->br_startblock == HOLESTARTBLOCK;
	chole = curr->br_startblock == HOLESTARTBLOCK;
	nhole = new->br_startblock == HOLESTARTBLOCK;

	if (chole)
		state |= CHOLE;
	if (!lhole && !chole && xmi_can_merge(left, curr))
		state |= CLEFT_CONTIG;
	if (!rhole && !chole && xmi_can_merge(curr, right))
		state |= CRIGHT_CONTIG;
	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
	    !xmi_can_merge_all(left, curr, right))
		state &= ~CRIGHT_CONTIG;

	if (nhole)
		state |= NHOLE;
	if (!lhole && !nhole && xmi_can_merge(left, new))
		state |= NLEFT_CONTIG;
	if (!rhole && !nhole && xmi_can_merge(new, right))
		state |= NRIGHT_CONTIG;
	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
	    !xmi_can_merge_all(left, new, right))
		state &= ~NRIGHT_CONTIG;

	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
	case CLEFT_CONTIG | CRIGHT_CONTIG:
		/*
		 * left/curr/right are the same mapping, so deleting curr
		 * causes 2 new mappings to be created.
		 */
		ret += 2;
		break;
	case 0:
		/*
		 * curr is not contiguous with any mapping, so we remove curr
		 * completely
		 */
		ret--;
		break;
	case CHOLE:
		/* hole, do nothing */
		break;
	case CLEFT_CONTIG:
	case CRIGHT_CONTIG:
		/* trim either left or right, no change */
		break;
	}

	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
	case NLEFT_CONTIG | NRIGHT_CONTIG:
		/*
		 * left/new/right will become the same mapping, so adding
		 * new causes the deletion of right.
		 */
		ret--;
		break;
	case 0:
		/* new is not contiguous with any mapping */
		ret++;
		break;
	case NHOLE:
		/* hole, do nothing. */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
			state);
	return ret;
}

/* Make sure we don't overflow the extent (mapping) counters. */
static inline int
xmi_ensure_delta_nextents(
	struct xfs_exchmaps_req	*req,
	struct xfs_inode	*ip,
	int64_t			delta)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			whichfork = xfs_exchmaps_reqfork(req);
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	uint64_t		new_nextents;
	xfs_extnum_t		max_nextents;

	if (delta < 0)
		return 0;

	/*
	 * It's always an error if the delta causes integer overflow. delta
	 * needs an explicit cast here to avoid warnings about implicit casts
	 * coded into the overflow check.
	 */
	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
			       &new_nextents))
		return -EFBIG;

	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
	    new_nextents > 10)
		return -EFBIG;

	/*
	 * We always promote both inodes to have large extent counts if the
	 * superblock feature is enabled, so we only need to check against the
	 * theoretical maximum.
	 */
	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
					     whichfork);
	if (new_nextents > max_nextents)
		return -EFBIG;

	return 0;
}

/* Find the next mapping after irec. */
static inline int
xmi_next(
	struct xfs_inode		*ip,
	int				bmap_flags,
	const struct xfs_bmbt_irec	*irec,
	struct xfs_bmbt_irec		*nrec)
{
	xfs_fileoff_t		off;
	xfs_filblks_t		blockcount;
	int			nimaps = 1;
	int			error;

	off = irec->br_startoff + irec->br_blockcount;
	blockcount = XFS_MAX_FILEOFF - off;
	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
	if (error)
		return error;
	if (nrec->br_startblock == DELAYSTARTBLOCK ||
	    nrec->br_startoff != off) {
		/*
		 * If we don't get the mapping we want, return a zero-length
		 * mapping, which our estimator function will pretend is a
		 * hole. We shouldn't get delalloc reservations.
		 */
		nrec->br_startblock = HOLESTARTBLOCK;
	}

	return 0;
}

int __init
xfs_exchmaps_intent_init_cache(void)
{
	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
			sizeof(struct xfs_exchmaps_intent),
			0, 0, NULL);

	return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_exchmaps_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_exchmaps_intent_cache);
	xfs_exchmaps_intent_cache = NULL;
}

/*
 * Decide if we will exchange the reflink flags between the two files after the
 * exchange. The only time we want to do this is if we're exchanging all
 * mappings under EOF and the inode reflink flags have different states.
 */
static inline bool
xmi_can_exchange_reflink_flags(
	const struct xfs_exchmaps_req	*req,
	unsigned int			reflink_state)
{
	struct xfs_mount	*mp = req->ip1->i_mount;

	if (hweight32(reflink_state) != 1)
		return false;
	if (req->startoff1 != 0 || req->startoff2 != 0)
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
		return false;
	return true;
}

/* Allocate and initialize a new incore intent item from a request. */
struct xfs_exchmaps_intent *
xfs_exchmaps_init_intent(
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;
	unsigned int			rs = 0;

	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
			GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&xmi->xmi_list);
	xmi->xmi_ip1 = req->ip1;
	xmi->xmi_ip2 = req->ip2;
	xmi->xmi_startoff1 = req->startoff1;
	xmi->xmi_startoff2 = req->startoff2;
	xmi->xmi_blockcount = req->blockcount;
	xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;

	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
		return xmi;
	}

	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
		xmi->xmi_isize1 = req->ip2->i_disk_size;
		xmi->xmi_isize2 = req->ip1->i_disk_size;
	}

	/* Record the state of each inode's reflink flag before the op. */
	if (xfs_is_reflink_inode(req->ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(req->ip2))
		rs |= 2;
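
	/*
	 * Here rs = 1 means that only ip1 has the reflink flag set and
	 * rs = 2 means that only ip2 does; rs = 0 or 3 means the flags
	 * already agree, which is why the helper above requires exactly one
	 * bit to be set (hweight32(reflink_state) == 1).
	 */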

	/*
	 * Figure out if we're clearing the reflink flags (which effectively
	 * exchanges them) after the operation.
	 */
	if (xmi_can_exchange_reflink_flags(req, rs)) {
		if (rs & 1)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
		if (rs & 2)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;

	return xmi;
}

/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_exchmaps_estimate(
	struct xfs_exchmaps_req		*req)
{
	struct xfs_exchmaps_intent	*xmi;
	struct xfs_bmbt_irec		irec1, irec2;
	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
	int64_t				d_nexts1, d_nexts2;
	int				bmap_flags;
	int				error;

	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
	xmi = xfs_exchmaps_init_intent(req);

	/*
	 * To guard against the possibility of overflowing the extent counters,
	 * we have to estimate an upper bound on the potential increase in that
	 * counter. We can split the mapping at each end of the range, and for
	 * each step of the exchange we can split the mapping that we're
	 * working on if the mappings do not align.
	 */
	d_nexts1 = d_nexts2 = 3;

	while (xmi_has_more_exchange_work(xmi)) {
		/*
		 * Walk through the file ranges until we find something to
		 * exchange. Because we're simulating the exchange, pass in
		 * adj to capture skipped mappings for correct estimation of
		 * bmbt record merges.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
		if (error)
			goto out_free;
		if (!xmi_has_more_exchange_work(xmi))
			break;

		/* Update accounting. */
		if (xfs_bmap_is_real_extent(&irec1))
			ip1_blocks += irec1.br_blockcount;
		if (xfs_bmap_is_real_extent(&irec2))
			ip2_blocks += irec2.br_blockcount;
		req->nr_exchanges++;

		/* Read the next mappings from both files. */
		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
		if (error)
			goto out_free;

		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
		if (error)
			goto out_free;

		/* Update extent count deltas. */
		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left1, &irec1, &irec2, &adj.right1);

		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left2, &irec2, &irec1, &adj.right2);

		/* Now pretend we exchanged the mappings. */
		if (xmi_can_merge(&adj.left2, &irec1))
			adj.left2.br_blockcount += irec1.br_blockcount;
		else
			memcpy(&adj.left2, &irec1, sizeof(irec1));

		if (xmi_can_merge(&adj.left1, &irec2))
			adj.left1.br_blockcount += irec2.br_blockcount;
		else
			memcpy(&adj.left1, &irec2, sizeof(irec2));

		xmi_advance(xmi, &irec1);
	}

	/* Account for the blocks that are being exchanged. */
	if (XFS_IS_REALTIME_INODE(req->ip1) &&
	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
		req->ip1_rtbcount = ip1_blocks;
		req->ip2_rtbcount = ip2_blocks;
	} else {
		req->ip1_bcount = ip1_blocks;
		req->ip2_bcount = ip2_blocks;
	}
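
	/*
	 * Illustrative numbers: exchanging 100 real blocks in each direction
	 * between two data-device files leaves ip1_bcount = ip2_bcount = 100
	 * here; the call to xfs_exchmaps_estimate_overhead() at the end of
	 * this function then folds the worst-case btree growth into these
	 * counts and into req->resblks.
	 */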

	/*
	 * Make sure that both forks have enough slack left in their extent
	 * counters that the exchange operation will not overflow.
	 */
	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
	if (req->ip1 == req->ip2) {
		error = xmi_ensure_delta_nextents(req, req->ip1,
				d_nexts1 + d_nexts2);
	} else {
		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
		if (error)
			goto out_free;
		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
	}
	if (error)
		goto out_free;

	trace_xfs_exchmaps_initial_estimate(req);
	error = xfs_exchmaps_estimate_overhead(req);
out_free:
	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
	return error;
}

/* Set the reflink flag before an operation. */
static inline void
xfs_exchmaps_set_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_set_inode_flag(ip);

	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/*
 * If either file has shared blocks and we're exchanging data forks, we must
 * flag the other file as having shared blocks so that we get the shared-block
 * rmap functions if we need to fix up the rmaps.
 */
void
xfs_exchmaps_ensure_reflink(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	unsigned int		rs = 0;

	if (xfs_is_reflink_inode(xmi->xmi_ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(xmi->xmi_ip2))
		rs |= 2;

	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);

	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
}

/* Set the large extent count flag before an operation if needed. */
static inline void
xfs_exchmaps_ensure_large_extent_counts(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	if (xfs_inode_has_large_extent_counts(ip))
		return;

	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Widen the extent counter fields of both inodes if necessary. */
void
xfs_exchmaps_upgrade_extent_counts(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	if (!xfs_has_large_extent_counts(tp->t_mountp))
		return;

	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
}
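
/*
 * A minimal sketch of the expected call sequence (assumed from the
 * requirements stated below, not code in this file): the caller fills out
 * a struct xfs_exchmaps_req, calls xfs_exchmaps_estimate() to size the
 * reservation, allocates a transaction with at least req->resblks blocks,
 * ILOCKs and joins both inodes, calls xfs_exchange_mappings(), and commits
 * the transaction to kick off the deferred exchange work.
 */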

/*
 * Schedule an exchange of a range of mappings from one inode to another.
 *
 * The use of file mapping exchange log intent items ensures the operation can
 * be resumed even if the system goes down. The caller must commit the
 * transaction to start the work.
 *
 * The caller must ensure the inodes are joined to the transaction and
 * ILOCKed; they will still be joined to the transaction at exit.
 */
void
xfs_exchange_mappings(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;

	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);

	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
	ASSERT(xfs_has_exchange_range(tp->t_mountp));

	if (req->blockcount == 0)
		return;

	xmi = xfs_exchmaps_init_intent(req);
	xfs_exchmaps_defer_add(tp, xmi);
	xfs_exchmaps_ensure_reflink(tp, xmi);
	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
}