1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_defer.h" 14 #include "xfs_inode.h" 15 #include "xfs_trans.h" 16 #include "xfs_bmap.h" 17 #include "xfs_icache.h" 18 #include "xfs_quota.h" 19 #include "xfs_exchmaps.h" 20 #include "xfs_trace.h" 21 #include "xfs_bmap_btree.h" 22 #include "xfs_trans_space.h" 23 #include "xfs_error.h" 24 #include "xfs_errortag.h" 25 #include "xfs_health.h" 26 #include "xfs_exchmaps_item.h" 27 #include "xfs_da_format.h" 28 #include "xfs_da_btree.h" 29 #include "xfs_attr_leaf.h" 30 #include "xfs_attr.h" 31 #include "xfs_dir2_priv.h" 32 #include "xfs_dir2.h" 33 #include "xfs_symlink_remote.h" 34 35 struct kmem_cache *xfs_exchmaps_intent_cache; 36 37 /* bmbt mappings adjacent to a pair of records. */ 38 struct xfs_exchmaps_adjacent { 39 struct xfs_bmbt_irec left1; 40 struct xfs_bmbt_irec right1; 41 struct xfs_bmbt_irec left2; 42 struct xfs_bmbt_irec right2; 43 }; 44 45 #define ADJACENT_INIT { \ 46 .left1 = { .br_startblock = HOLESTARTBLOCK }, \ 47 .right1 = { .br_startblock = HOLESTARTBLOCK }, \ 48 .left2 = { .br_startblock = HOLESTARTBLOCK }, \ 49 .right2 = { .br_startblock = HOLESTARTBLOCK }, \ 50 } 51 52 /* Information to reset reflink flag / CoW fork state after an exchange. */ 53 54 /* 55 * If the reflink flag is set on either inode, make sure it has an incore CoW 56 * fork, since all reflink inodes must have them. If there's a CoW fork and it 57 * has mappings in it, make sure the inodes are tagged appropriately so that 58 * speculative preallocations can be GC'd if we run low of space. 59 */ 60 static inline void 61 xfs_exchmaps_ensure_cowfork( 62 struct xfs_inode *ip) 63 { 64 struct xfs_ifork *cfork; 65 66 if (xfs_is_reflink_inode(ip)) 67 xfs_ifork_init_cow(ip); 68 69 cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); 70 if (!cfork) 71 return; 72 if (cfork->if_bytes > 0) 73 xfs_inode_set_cowblocks_tag(ip); 74 else 75 xfs_inode_clear_cowblocks_tag(ip); 76 } 77 78 /* 79 * Adjust the on-disk inode size upwards if needed so that we never add 80 * mappings into the file past EOF. This is crucial so that log recovery won't 81 * get confused by the sudden appearance of post-eof mappings. 82 */ 83 STATIC void 84 xfs_exchmaps_update_size( 85 struct xfs_trans *tp, 86 struct xfs_inode *ip, 87 struct xfs_bmbt_irec *imap, 88 xfs_fsize_t new_isize) 89 { 90 struct xfs_mount *mp = tp->t_mountp; 91 xfs_fsize_t len; 92 93 if (new_isize < 0) 94 return; 95 96 len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount), 97 new_isize); 98 99 if (len <= ip->i_disk_size) 100 return; 101 102 trace_xfs_exchmaps_update_inode_size(ip, len); 103 104 ip->i_disk_size = len; 105 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 106 } 107 108 /* Advance the incore state tracking after exchanging a mapping. */ 109 static inline void 110 xmi_advance( 111 struct xfs_exchmaps_intent *xmi, 112 const struct xfs_bmbt_irec *irec) 113 { 114 xmi->xmi_startoff1 += irec->br_blockcount; 115 xmi->xmi_startoff2 += irec->br_blockcount; 116 xmi->xmi_blockcount -= irec->br_blockcount; 117 } 118 119 /* Do we still have more mappings to exchange? */ 120 static inline bool 121 xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi) 122 { 123 return xmi->xmi_blockcount > 0; 124 } 125 126 /* Do we have post-operation cleanups to perform? */ 127 static inline bool 128 xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi) 129 { 130 return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK | 131 XFS_EXCHMAPS_CLEAR_INO2_REFLINK | 132 __XFS_EXCHMAPS_INO2_SHORTFORM); 133 } 134 135 /* Check all mappings to make sure we can actually exchange them. */ 136 int 137 xfs_exchmaps_check_forks( 138 struct xfs_mount *mp, 139 const struct xfs_exchmaps_req *req) 140 { 141 struct xfs_ifork *ifp1, *ifp2; 142 int whichfork = xfs_exchmaps_reqfork(req); 143 144 /* No fork? */ 145 ifp1 = xfs_ifork_ptr(req->ip1, whichfork); 146 ifp2 = xfs_ifork_ptr(req->ip2, whichfork); 147 if (!ifp1 || !ifp2) 148 return -EINVAL; 149 150 /* We don't know how to exchange local format forks. */ 151 if (ifp1->if_format == XFS_DINODE_FMT_LOCAL || 152 ifp2->if_format == XFS_DINODE_FMT_LOCAL) 153 return -EINVAL; 154 155 return 0; 156 } 157 158 #ifdef CONFIG_XFS_QUOTA 159 /* Log the actual updates to the quota accounting. */ 160 static inline void 161 xfs_exchmaps_update_quota( 162 struct xfs_trans *tp, 163 struct xfs_exchmaps_intent *xmi, 164 struct xfs_bmbt_irec *irec1, 165 struct xfs_bmbt_irec *irec2) 166 { 167 int64_t ip1_delta = 0, ip2_delta = 0; 168 unsigned int qflag; 169 170 qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT : 171 XFS_TRANS_DQ_BCOUNT; 172 173 if (xfs_bmap_is_real_extent(irec1)) { 174 ip1_delta -= irec1->br_blockcount; 175 ip2_delta += irec1->br_blockcount; 176 } 177 178 if (xfs_bmap_is_real_extent(irec2)) { 179 ip1_delta += irec2->br_blockcount; 180 ip2_delta -= irec2->br_blockcount; 181 } 182 183 xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta); 184 xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta); 185 } 186 #else 187 # define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2) ((void)0) 188 #endif 189 190 /* Decide if we want to skip this mapping from file1. */ 191 static inline bool 192 xfs_exchmaps_can_skip_mapping( 193 struct xfs_exchmaps_intent *xmi, 194 struct xfs_bmbt_irec *irec) 195 { 196 struct xfs_mount *mp = xmi->xmi_ip1->i_mount; 197 198 /* Do not skip this mapping if the caller did not tell us to. */ 199 if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN)) 200 return false; 201 202 /* Do not skip mapped, written mappings. */ 203 if (xfs_bmap_is_written_extent(irec)) 204 return false; 205 206 /* 207 * The mapping is unwritten or a hole. It cannot be a delalloc 208 * reservation because we already excluded those. It cannot be an 209 * unwritten extent with dirty page cache because we flushed the page 210 * cache. For files where the allocation unit is 1FSB (files on the 211 * data dev, rt files if the extent size is 1FSB), we can safely 212 * skip this mapping. 213 */ 214 if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1)) 215 return true; 216 217 /* 218 * For a realtime file with a multi-fsb allocation unit, the decision 219 * is trickier because we can only swap full allocation units. 220 * Unwritten mappings can appear in the middle of an rtx if the rtx is 221 * partially written, but they can also appear for preallocations. 222 * 223 * If the mapping is a hole, skip it entirely. Holes should align with 224 * rtx boundaries. 225 */ 226 if (!xfs_bmap_is_real_extent(irec)) 227 return true; 228 229 /* 230 * All mappings below this point are unwritten. 231 * 232 * - If the beginning is not aligned to an rtx, trim the end of the 233 * mapping so that it does not cross an rtx boundary, and swap it. 234 * 235 * - If both ends are aligned to an rtx, skip the entire mapping. 236 */ 237 if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) { 238 xfs_fileoff_t new_end; 239 240 new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize); 241 irec->br_blockcount = min(irec->br_blockcount, 242 new_end - irec->br_startoff); 243 return false; 244 } 245 if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize)) 246 return true; 247 248 /* 249 * All mappings below this point are unwritten, start on an rtx 250 * boundary, and do not end on an rtx boundary. 251 * 252 * - If the mapping is longer than one rtx, trim the end of the mapping 253 * down to an rtx boundary and skip it. 254 * 255 * - The mapping is shorter than one rtx. Swap it. 256 */ 257 if (irec->br_blockcount > mp->m_sb.sb_rextsize) { 258 xfs_fileoff_t new_end; 259 260 new_end = rounddown_64(irec->br_startoff + irec->br_blockcount, 261 mp->m_sb.sb_rextsize); 262 irec->br_blockcount = new_end - irec->br_startoff; 263 return true; 264 } 265 266 return false; 267 } 268 269 /* 270 * Walk forward through the file ranges in @xmi until we find two different 271 * mappings to exchange. If there is work to do, return the mappings; 272 * otherwise we've reached the end of the range and xmi_blockcount will be 273 * zero. 274 * 275 * If the walk skips over a pair of mappings to the same storage, save them as 276 * the left records in @adj (if provided) so that the simulation phase can 277 * avoid an extra lookup. 278 */ 279 static int 280 xfs_exchmaps_find_mappings( 281 struct xfs_exchmaps_intent *xmi, 282 struct xfs_bmbt_irec *irec1, 283 struct xfs_bmbt_irec *irec2, 284 struct xfs_exchmaps_adjacent *adj) 285 { 286 int nimaps; 287 int bmap_flags; 288 int error; 289 290 bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi)); 291 292 for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) { 293 /* Read mapping from the first file */ 294 nimaps = 1; 295 error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1, 296 xmi->xmi_blockcount, irec1, &nimaps, 297 bmap_flags); 298 if (error) 299 return error; 300 if (nimaps != 1 || 301 irec1->br_startblock == DELAYSTARTBLOCK || 302 irec1->br_startoff != xmi->xmi_startoff1) { 303 /* 304 * We should never get no mapping or a delalloc mapping 305 * or something that doesn't match what we asked for, 306 * since the caller flushed both inodes and we hold the 307 * ILOCKs for both inodes. 308 */ 309 ASSERT(0); 310 return -EINVAL; 311 } 312 313 if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) { 314 trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1); 315 continue; 316 } 317 318 /* Read mapping from the second file */ 319 nimaps = 1; 320 error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2, 321 irec1->br_blockcount, irec2, &nimaps, 322 bmap_flags); 323 if (error) 324 return error; 325 if (nimaps != 1 || 326 irec2->br_startblock == DELAYSTARTBLOCK || 327 irec2->br_startoff != xmi->xmi_startoff2) { 328 /* 329 * We should never get no mapping or a delalloc mapping 330 * or something that doesn't match what we asked for, 331 * since the caller flushed both inodes and we hold the 332 * ILOCKs for both inodes. 333 */ 334 ASSERT(0); 335 return -EINVAL; 336 } 337 338 /* 339 * We can only exchange as many blocks as the smaller of the 340 * two mapping maps. 341 */ 342 irec1->br_blockcount = min(irec1->br_blockcount, 343 irec2->br_blockcount); 344 345 trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1); 346 trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2); 347 348 /* We found something to exchange, so return it. */ 349 if (irec1->br_startblock != irec2->br_startblock) 350 return 0; 351 352 /* 353 * Two mappings pointing to the same physical block must not 354 * have different states; that's filesystem corruption. Move 355 * on to the next mapping if they're both holes or both point 356 * to the same physical space extent. 357 */ 358 if (irec1->br_state != irec2->br_state) { 359 xfs_bmap_mark_sick(xmi->xmi_ip1, 360 xfs_exchmaps_whichfork(xmi)); 361 xfs_bmap_mark_sick(xmi->xmi_ip2, 362 xfs_exchmaps_whichfork(xmi)); 363 return -EFSCORRUPTED; 364 } 365 366 /* 367 * Save the mappings if we're estimating work and skipping 368 * these identical mappings. 369 */ 370 if (adj) { 371 memcpy(&adj->left1, irec1, sizeof(*irec1)); 372 memcpy(&adj->left2, irec2, sizeof(*irec2)); 373 } 374 } 375 376 return 0; 377 } 378 379 /* Exchange these two mappings. */ 380 static void 381 xfs_exchmaps_one_step( 382 struct xfs_trans *tp, 383 struct xfs_exchmaps_intent *xmi, 384 struct xfs_bmbt_irec *irec1, 385 struct xfs_bmbt_irec *irec2) 386 { 387 int whichfork = xfs_exchmaps_whichfork(xmi); 388 389 xfs_exchmaps_update_quota(tp, xmi, irec1, irec2); 390 391 /* Remove both mappings. */ 392 xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1); 393 xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2); 394 395 /* 396 * Re-add both mappings. We exchange the file offsets between the two 397 * maps and add the opposite map, which has the effect of filling the 398 * logical offsets we just unmapped, but with with the physical mapping 399 * information exchanged. 400 */ 401 swap(irec1->br_startoff, irec2->br_startoff); 402 xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2); 403 xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1); 404 405 /* Make sure we're not adding mappings past EOF. */ 406 if (whichfork == XFS_DATA_FORK) { 407 xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2, 408 xmi->xmi_isize1); 409 xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1, 410 xmi->xmi_isize2); 411 } 412 413 /* 414 * Advance our cursor and exit. The caller (either defer ops or log 415 * recovery) will log the XMD item, and if *blockcount is nonzero, it 416 * will log a new XMI item for the remainder and call us back. 417 */ 418 xmi_advance(xmi, irec1); 419 } 420 421 /* Convert inode2's leaf attr fork back to shortform, if possible.. */ 422 STATIC int 423 xfs_exchmaps_attr_to_sf( 424 struct xfs_trans *tp, 425 struct xfs_exchmaps_intent *xmi) 426 { 427 struct xfs_da_args args = { 428 .dp = xmi->xmi_ip2, 429 .geo = tp->t_mountp->m_attr_geo, 430 .whichfork = XFS_ATTR_FORK, 431 .trans = tp, 432 .owner = xmi->xmi_ip2->i_ino, 433 }; 434 struct xfs_buf *bp; 435 int forkoff; 436 int error; 437 438 if (!xfs_attr_is_leaf(xmi->xmi_ip2)) 439 return 0; 440 441 error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0, 442 &bp); 443 if (error) 444 return error; 445 446 forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2); 447 if (forkoff == 0) 448 return 0; 449 450 return xfs_attr3_leaf_to_shortform(bp, &args, forkoff); 451 } 452 453 /* Convert inode2's block dir fork back to shortform, if possible.. */ 454 STATIC int 455 xfs_exchmaps_dir_to_sf( 456 struct xfs_trans *tp, 457 struct xfs_exchmaps_intent *xmi) 458 { 459 struct xfs_da_args args = { 460 .dp = xmi->xmi_ip2, 461 .geo = tp->t_mountp->m_dir_geo, 462 .whichfork = XFS_DATA_FORK, 463 .trans = tp, 464 .owner = xmi->xmi_ip2->i_ino, 465 }; 466 struct xfs_dir2_sf_hdr sfh; 467 struct xfs_buf *bp; 468 bool isblock; 469 int size; 470 int error; 471 472 error = xfs_dir2_isblock(&args, &isblock); 473 if (error) 474 return error; 475 476 if (!isblock) 477 return 0; 478 479 error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp); 480 if (error) 481 return error; 482 483 size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh); 484 if (size > xfs_inode_data_fork_size(xmi->xmi_ip2)) 485 return 0; 486 487 return xfs_dir2_block_to_sf(&args, bp, size, &sfh); 488 } 489 490 /* Convert inode2's remote symlink target back to shortform, if possible. */ 491 STATIC int 492 xfs_exchmaps_link_to_sf( 493 struct xfs_trans *tp, 494 struct xfs_exchmaps_intent *xmi) 495 { 496 struct xfs_inode *ip = xmi->xmi_ip2; 497 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 498 char *buf; 499 int error; 500 501 if (ifp->if_format == XFS_DINODE_FMT_LOCAL || 502 ip->i_disk_size > xfs_inode_data_fork_size(ip)) 503 return 0; 504 505 /* Read the current symlink target into a buffer. */ 506 buf = kmalloc(ip->i_disk_size + 1, 507 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 508 if (!buf) { 509 ASSERT(0); 510 return -ENOMEM; 511 } 512 513 error = xfs_symlink_remote_read(ip, buf); 514 if (error) 515 goto free; 516 517 /* Remove the blocks. */ 518 error = xfs_symlink_remote_truncate(tp, ip); 519 if (error) 520 goto free; 521 522 /* Convert fork to local format and log our changes. */ 523 xfs_idestroy_fork(ifp); 524 ifp->if_bytes = 0; 525 ifp->if_format = XFS_DINODE_FMT_LOCAL; 526 xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size); 527 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); 528 free: 529 kfree(buf); 530 return error; 531 } 532 533 /* Clear the reflink flag after an exchange. */ 534 static inline void 535 xfs_exchmaps_clear_reflink( 536 struct xfs_trans *tp, 537 struct xfs_inode *ip) 538 { 539 trace_xfs_reflink_unset_inode_flag(ip); 540 541 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 542 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 543 } 544 545 /* Finish whatever work might come after an exchange operation. */ 546 static int 547 xfs_exchmaps_do_postop_work( 548 struct xfs_trans *tp, 549 struct xfs_exchmaps_intent *xmi) 550 { 551 if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) { 552 int error = 0; 553 554 if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK) 555 error = xfs_exchmaps_attr_to_sf(tp, xmi); 556 else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode)) 557 error = xfs_exchmaps_dir_to_sf(tp, xmi); 558 else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) 559 error = xfs_exchmaps_link_to_sf(tp, xmi); 560 xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM; 561 if (error) 562 return error; 563 } 564 565 if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) { 566 xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1); 567 xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK; 568 } 569 570 if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) { 571 xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2); 572 xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK; 573 } 574 575 return 0; 576 } 577 578 /* Finish one step in a mapping exchange operation, possibly relogging. */ 579 int 580 xfs_exchmaps_finish_one( 581 struct xfs_trans *tp, 582 struct xfs_exchmaps_intent *xmi) 583 { 584 struct xfs_bmbt_irec irec1, irec2; 585 int error; 586 587 if (xmi_has_more_exchange_work(xmi)) { 588 /* 589 * If the operation state says that some range of the files 590 * have not yet been exchanged, look for mappings in that range 591 * to exchange. If we find some mappings, exchange them. 592 */ 593 error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL); 594 if (error) 595 return error; 596 597 if (xmi_has_more_exchange_work(xmi)) 598 xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2); 599 600 /* 601 * If the caller asked us to exchange the file sizes after the 602 * exchange and either we just exchanged the last mappings in 603 * the range or we didn't find anything to exchange, update the 604 * ondisk file sizes. 605 */ 606 if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) && 607 !xmi_has_more_exchange_work(xmi)) { 608 xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1; 609 xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2; 610 611 xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE); 612 xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE); 613 } 614 } else if (xmi_has_postop_work(xmi)) { 615 /* 616 * Now that we're finished with the exchange operation, 617 * complete the post-op cleanup work. 618 */ 619 error = xfs_exchmaps_do_postop_work(tp, xmi); 620 if (error) 621 return error; 622 } 623 624 if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) 625 return -EIO; 626 627 /* If we still have work to do, ask for a new transaction. */ 628 if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) { 629 trace_xfs_exchmaps_defer(tp->t_mountp, xmi); 630 return -EAGAIN; 631 } 632 633 /* 634 * If we reach here, we've finished all the exchange work and the post 635 * operation work. The last thing we need to do before returning to 636 * the caller is to make sure that COW forks are set up correctly. 637 */ 638 if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) { 639 xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1); 640 xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2); 641 } 642 643 return 0; 644 } 645 646 /* 647 * Compute the amount of bmbt blocks we should reserve for each file. In the 648 * worst case, each exchange will fill a hole with a new mapping, which could 649 * result in a btree split every time we add a new leaf block. 650 */ 651 static inline uint64_t 652 xfs_exchmaps_bmbt_blocks( 653 struct xfs_mount *mp, 654 const struct xfs_exchmaps_req *req) 655 { 656 return howmany_64(req->nr_exchanges, 657 XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) * 658 XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req)); 659 } 660 661 /* Compute the space we should reserve for the rmap btree expansions. */ 662 static inline uint64_t 663 xfs_exchmaps_rmapbt_blocks( 664 struct xfs_mount *mp, 665 const struct xfs_exchmaps_req *req) 666 { 667 if (!xfs_has_rmapbt(mp)) 668 return 0; 669 if (XFS_IS_REALTIME_INODE(req->ip1)) 670 return 0; 671 672 return howmany_64(req->nr_exchanges, 673 XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * 674 XFS_RMAPADD_SPACE_RES(mp); 675 } 676 677 /* Estimate the bmbt and rmapbt overhead required to exchange mappings. */ 678 int 679 xfs_exchmaps_estimate_overhead( 680 struct xfs_exchmaps_req *req) 681 { 682 struct xfs_mount *mp = req->ip1->i_mount; 683 xfs_filblks_t bmbt_blocks; 684 xfs_filblks_t rmapbt_blocks; 685 xfs_filblks_t resblks = req->resblks; 686 687 /* 688 * Compute the number of bmbt and rmapbt blocks we might need to handle 689 * the estimated number of exchanges. 690 */ 691 bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req); 692 rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req); 693 694 trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks); 695 696 /* Make sure the change in file block count doesn't overflow. */ 697 if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount)) 698 return -EFBIG; 699 if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount)) 700 return -EFBIG; 701 702 /* 703 * Add together the number of blocks we need to handle btree growth, 704 * then add it to the number of blocks we need to reserve to this 705 * transaction. 706 */ 707 if (check_add_overflow(resblks, bmbt_blocks, &resblks)) 708 return -ENOSPC; 709 if (check_add_overflow(resblks, bmbt_blocks, &resblks)) 710 return -ENOSPC; 711 if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) 712 return -ENOSPC; 713 if (check_add_overflow(resblks, rmapbt_blocks, &resblks)) 714 return -ENOSPC; 715 716 /* Can't actually reserve more than UINT_MAX blocks. */ 717 if (req->resblks > UINT_MAX) 718 return -ENOSPC; 719 720 req->resblks = resblks; 721 trace_xfs_exchmaps_final_estimate(req); 722 return 0; 723 } 724 725 /* Decide if we can merge two real mappings. */ 726 static inline bool 727 xmi_can_merge( 728 const struct xfs_bmbt_irec *b1, 729 const struct xfs_bmbt_irec *b2) 730 { 731 /* Don't merge holes. */ 732 if (b1->br_startblock == HOLESTARTBLOCK || 733 b2->br_startblock == HOLESTARTBLOCK) 734 return false; 735 736 /* We don't merge holes. */ 737 if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2)) 738 return false; 739 740 if (b1->br_startoff + b1->br_blockcount == b2->br_startoff && 741 b1->br_startblock + b1->br_blockcount == b2->br_startblock && 742 b1->br_state == b2->br_state && 743 b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN) 744 return true; 745 746 return false; 747 } 748 749 /* 750 * Decide if we can merge three mappings. Caller must ensure all three 751 * mappings must not be holes or delalloc reservations. 752 */ 753 static inline bool 754 xmi_can_merge_all( 755 const struct xfs_bmbt_irec *l, 756 const struct xfs_bmbt_irec *m, 757 const struct xfs_bmbt_irec *r) 758 { 759 xfs_filblks_t new_len; 760 761 new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount; 762 return new_len <= XFS_MAX_BMBT_EXTLEN; 763 } 764 765 #define CLEFT_CONTIG 0x01 766 #define CRIGHT_CONTIG 0x02 767 #define CHOLE 0x04 768 #define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG) 769 770 #define NLEFT_CONTIG 0x10 771 #define NRIGHT_CONTIG 0x20 772 #define NHOLE 0x40 773 #define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG) 774 775 /* Estimate the effect of a single exchange on mapping count. */ 776 static inline int 777 xmi_delta_nextents_step( 778 struct xfs_mount *mp, 779 const struct xfs_bmbt_irec *left, 780 const struct xfs_bmbt_irec *curr, 781 const struct xfs_bmbt_irec *new, 782 const struct xfs_bmbt_irec *right) 783 { 784 bool lhole, rhole, chole, nhole; 785 unsigned int state = 0; 786 int ret = 0; 787 788 lhole = left->br_startblock == HOLESTARTBLOCK; 789 rhole = right->br_startblock == HOLESTARTBLOCK; 790 chole = curr->br_startblock == HOLESTARTBLOCK; 791 nhole = new->br_startblock == HOLESTARTBLOCK; 792 793 if (chole) 794 state |= CHOLE; 795 if (!lhole && !chole && xmi_can_merge(left, curr)) 796 state |= CLEFT_CONTIG; 797 if (!rhole && !chole && xmi_can_merge(curr, right)) 798 state |= CRIGHT_CONTIG; 799 if ((state & CBOTH_CONTIG) == CBOTH_CONTIG && 800 !xmi_can_merge_all(left, curr, right)) 801 state &= ~CRIGHT_CONTIG; 802 803 if (nhole) 804 state |= NHOLE; 805 if (!lhole && !nhole && xmi_can_merge(left, new)) 806 state |= NLEFT_CONTIG; 807 if (!rhole && !nhole && xmi_can_merge(new, right)) 808 state |= NRIGHT_CONTIG; 809 if ((state & NBOTH_CONTIG) == NBOTH_CONTIG && 810 !xmi_can_merge_all(left, new, right)) 811 state &= ~NRIGHT_CONTIG; 812 813 switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) { 814 case CLEFT_CONTIG | CRIGHT_CONTIG: 815 /* 816 * left/curr/right are the same mapping, so deleting curr 817 * causes 2 new mappings to be created. 818 */ 819 ret += 2; 820 break; 821 case 0: 822 /* 823 * curr is not contiguous with any mapping, so we remove curr 824 * completely 825 */ 826 ret--; 827 break; 828 case CHOLE: 829 /* hole, do nothing */ 830 break; 831 case CLEFT_CONTIG: 832 case CRIGHT_CONTIG: 833 /* trim either left or right, no change */ 834 break; 835 } 836 837 switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { 838 case NLEFT_CONTIG | NRIGHT_CONTIG: 839 /* 840 * left/curr/right will become the same mapping, so adding 841 * curr causes the deletion of right. 842 */ 843 ret--; 844 break; 845 case 0: 846 /* new is not contiguous with any mapping */ 847 ret++; 848 break; 849 case NHOLE: 850 /* hole, do nothing. */ 851 break; 852 case NLEFT_CONTIG: 853 case NRIGHT_CONTIG: 854 /* new is absorbed into left or right, no change */ 855 break; 856 } 857 858 trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret, 859 state); 860 return ret; 861 } 862 863 /* Make sure we don't overflow the extent (mapping) counters. */ 864 static inline int 865 xmi_ensure_delta_nextents( 866 struct xfs_exchmaps_req *req, 867 struct xfs_inode *ip, 868 int64_t delta) 869 { 870 struct xfs_mount *mp = ip->i_mount; 871 int whichfork = xfs_exchmaps_reqfork(req); 872 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 873 uint64_t new_nextents; 874 xfs_extnum_t max_nextents; 875 876 if (delta < 0) 877 return 0; 878 879 /* 880 * It's always an error if the delta causes integer overflow. delta 881 * needs an explicit cast here to avoid warnings about implicit casts 882 * coded into the overflow check. 883 */ 884 if (check_add_overflow(ifp->if_nextents, (uint64_t)delta, 885 &new_nextents)) 886 return -EFBIG; 887 888 if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && 889 new_nextents > 10) 890 return -EFBIG; 891 892 /* 893 * We always promote both inodes to have large extent counts if the 894 * superblock feature is enabled, so we only need to check against the 895 * theoretical maximum. 896 */ 897 max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), 898 whichfork); 899 if (new_nextents > max_nextents) 900 return -EFBIG; 901 902 return 0; 903 } 904 905 /* Find the next mapping after irec. */ 906 static inline int 907 xmi_next( 908 struct xfs_inode *ip, 909 int bmap_flags, 910 const struct xfs_bmbt_irec *irec, 911 struct xfs_bmbt_irec *nrec) 912 { 913 xfs_fileoff_t off; 914 xfs_filblks_t blockcount; 915 int nimaps = 1; 916 int error; 917 918 off = irec->br_startoff + irec->br_blockcount; 919 blockcount = XFS_MAX_FILEOFF - off; 920 error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags); 921 if (error) 922 return error; 923 if (nrec->br_startblock == DELAYSTARTBLOCK || 924 nrec->br_startoff != off) { 925 /* 926 * If we don't get the mapping we want, return a zero-length 927 * mapping, which our estimator function will pretend is a hole. 928 * We shouldn't get delalloc reservations. 929 */ 930 nrec->br_startblock = HOLESTARTBLOCK; 931 } 932 933 return 0; 934 } 935 936 int __init 937 xfs_exchmaps_intent_init_cache(void) 938 { 939 xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent", 940 sizeof(struct xfs_exchmaps_intent), 941 0, 0, NULL); 942 943 return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM; 944 } 945 946 void 947 xfs_exchmaps_intent_destroy_cache(void) 948 { 949 kmem_cache_destroy(xfs_exchmaps_intent_cache); 950 xfs_exchmaps_intent_cache = NULL; 951 } 952 953 /* 954 * Decide if we will exchange the reflink flags between the two files after the 955 * exchange. The only time we want to do this is if we're exchanging all 956 * mappings under EOF and the inode reflink flags have different states. 957 */ 958 static inline bool 959 xmi_can_exchange_reflink_flags( 960 const struct xfs_exchmaps_req *req, 961 unsigned int reflink_state) 962 { 963 struct xfs_mount *mp = req->ip1->i_mount; 964 965 if (hweight32(reflink_state) != 1) 966 return false; 967 if (req->startoff1 != 0 || req->startoff2 != 0) 968 return false; 969 if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size)) 970 return false; 971 if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size)) 972 return false; 973 return true; 974 } 975 976 977 /* Allocate and initialize a new incore intent item from a request. */ 978 struct xfs_exchmaps_intent * 979 xfs_exchmaps_init_intent( 980 const struct xfs_exchmaps_req *req) 981 { 982 struct xfs_exchmaps_intent *xmi; 983 unsigned int rs = 0; 984 985 xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache, 986 GFP_NOFS | __GFP_NOFAIL); 987 INIT_LIST_HEAD(&xmi->xmi_list); 988 xmi->xmi_ip1 = req->ip1; 989 xmi->xmi_ip2 = req->ip2; 990 xmi->xmi_startoff1 = req->startoff1; 991 xmi->xmi_startoff2 = req->startoff2; 992 xmi->xmi_blockcount = req->blockcount; 993 xmi->xmi_isize1 = xmi->xmi_isize2 = -1; 994 xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS; 995 996 if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) { 997 xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; 998 return xmi; 999 } 1000 1001 if (req->flags & XFS_EXCHMAPS_SET_SIZES) { 1002 xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES; 1003 xmi->xmi_isize1 = req->ip2->i_disk_size; 1004 xmi->xmi_isize2 = req->ip1->i_disk_size; 1005 } 1006 1007 /* Record the state of each inode's reflink flag before the op. */ 1008 if (xfs_is_reflink_inode(req->ip1)) 1009 rs |= 1; 1010 if (xfs_is_reflink_inode(req->ip2)) 1011 rs |= 2; 1012 1013 /* 1014 * Figure out if we're clearing the reflink flags (which effectively 1015 * exchanges them) after the operation. 1016 */ 1017 if (xmi_can_exchange_reflink_flags(req, rs)) { 1018 if (rs & 1) 1019 xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK; 1020 if (rs & 2) 1021 xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK; 1022 } 1023 1024 if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) || 1025 S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode)) 1026 xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM; 1027 1028 return xmi; 1029 } 1030 1031 /* 1032 * Estimate the number of exchange operations and the number of file blocks 1033 * in each file that will be affected by the exchange operation. 1034 */ 1035 int 1036 xfs_exchmaps_estimate( 1037 struct xfs_exchmaps_req *req) 1038 { 1039 struct xfs_exchmaps_intent *xmi; 1040 struct xfs_bmbt_irec irec1, irec2; 1041 struct xfs_exchmaps_adjacent adj = ADJACENT_INIT; 1042 xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0; 1043 int64_t d_nexts1, d_nexts2; 1044 int bmap_flags; 1045 int error; 1046 1047 ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS)); 1048 1049 bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req)); 1050 xmi = xfs_exchmaps_init_intent(req); 1051 1052 /* 1053 * To guard against the possibility of overflowing the extent counters, 1054 * we have to estimate an upper bound on the potential increase in that 1055 * counter. We can split the mapping at each end of the range, and for 1056 * each step of the exchange we can split the mapping that we're 1057 * working on if the mappings do not align. 1058 */ 1059 d_nexts1 = d_nexts2 = 3; 1060 1061 while (xmi_has_more_exchange_work(xmi)) { 1062 /* 1063 * Walk through the file ranges until we find something to 1064 * exchange. Because we're simulating the exchange, pass in 1065 * adj to capture skipped mappings for correct estimation of 1066 * bmbt record merges. 1067 */ 1068 error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj); 1069 if (error) 1070 goto out_free; 1071 if (!xmi_has_more_exchange_work(xmi)) 1072 break; 1073 1074 /* Update accounting. */ 1075 if (xfs_bmap_is_real_extent(&irec1)) 1076 ip1_blocks += irec1.br_blockcount; 1077 if (xfs_bmap_is_real_extent(&irec2)) 1078 ip2_blocks += irec2.br_blockcount; 1079 req->nr_exchanges++; 1080 1081 /* Read the next mappings from both files. */ 1082 error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1); 1083 if (error) 1084 goto out_free; 1085 1086 error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2); 1087 if (error) 1088 goto out_free; 1089 1090 /* Update extent count deltas. */ 1091 d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount, 1092 &adj.left1, &irec1, &irec2, &adj.right1); 1093 1094 d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount, 1095 &adj.left2, &irec2, &irec1, &adj.right2); 1096 1097 /* Now pretend we exchanged the mappings. */ 1098 if (xmi_can_merge(&adj.left2, &irec1)) 1099 adj.left2.br_blockcount += irec1.br_blockcount; 1100 else 1101 memcpy(&adj.left2, &irec1, sizeof(irec1)); 1102 1103 if (xmi_can_merge(&adj.left1, &irec2)) 1104 adj.left1.br_blockcount += irec2.br_blockcount; 1105 else 1106 memcpy(&adj.left1, &irec2, sizeof(irec2)); 1107 1108 xmi_advance(xmi, &irec1); 1109 } 1110 1111 /* Account for the blocks that are being exchanged. */ 1112 if (XFS_IS_REALTIME_INODE(req->ip1) && 1113 xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) { 1114 req->ip1_rtbcount = ip1_blocks; 1115 req->ip2_rtbcount = ip2_blocks; 1116 } else { 1117 req->ip1_bcount = ip1_blocks; 1118 req->ip2_bcount = ip2_blocks; 1119 } 1120 1121 /* 1122 * Make sure that both forks have enough slack left in their extent 1123 * counters that the exchange operation will not overflow. 1124 */ 1125 trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2); 1126 if (req->ip1 == req->ip2) { 1127 error = xmi_ensure_delta_nextents(req, req->ip1, 1128 d_nexts1 + d_nexts2); 1129 } else { 1130 error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1); 1131 if (error) 1132 goto out_free; 1133 error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2); 1134 } 1135 if (error) 1136 goto out_free; 1137 1138 trace_xfs_exchmaps_initial_estimate(req); 1139 error = xfs_exchmaps_estimate_overhead(req); 1140 out_free: 1141 kmem_cache_free(xfs_exchmaps_intent_cache, xmi); 1142 return error; 1143 } 1144 1145 /* Set the reflink flag before an operation. */ 1146 static inline void 1147 xfs_exchmaps_set_reflink( 1148 struct xfs_trans *tp, 1149 struct xfs_inode *ip) 1150 { 1151 trace_xfs_reflink_set_inode_flag(ip); 1152 1153 ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; 1154 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1155 } 1156 1157 /* 1158 * If either file has shared blocks and we're exchanging data forks, we must 1159 * flag the other file as having shared blocks so that we get the shared-block 1160 * rmap functions if we need to fix up the rmaps. 1161 */ 1162 void 1163 xfs_exchmaps_ensure_reflink( 1164 struct xfs_trans *tp, 1165 const struct xfs_exchmaps_intent *xmi) 1166 { 1167 unsigned int rs = 0; 1168 1169 if (xfs_is_reflink_inode(xmi->xmi_ip1)) 1170 rs |= 1; 1171 if (xfs_is_reflink_inode(xmi->xmi_ip2)) 1172 rs |= 2; 1173 1174 if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2)) 1175 xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2); 1176 1177 if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1)) 1178 xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1); 1179 } 1180 1181 /* Set the large extent count flag before an operation if needed. */ 1182 static inline void 1183 xfs_exchmaps_ensure_large_extent_counts( 1184 struct xfs_trans *tp, 1185 struct xfs_inode *ip) 1186 { 1187 if (xfs_inode_has_large_extent_counts(ip)) 1188 return; 1189 1190 ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; 1191 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1192 } 1193 1194 /* Widen the extent counter fields of both inodes if necessary. */ 1195 void 1196 xfs_exchmaps_upgrade_extent_counts( 1197 struct xfs_trans *tp, 1198 const struct xfs_exchmaps_intent *xmi) 1199 { 1200 if (!xfs_has_large_extent_counts(tp->t_mountp)) 1201 return; 1202 1203 xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1); 1204 xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2); 1205 } 1206 1207 /* 1208 * Schedule an exchange a range of mappings from one inode to another. 1209 * 1210 * The use of file mapping exchange log intent items ensures the operation can 1211 * be resumed even if the system goes down. The caller must commit the 1212 * transaction to start the work. 1213 * 1214 * The caller must ensure the inodes must be joined to the transaction and 1215 * ILOCKd; they will still be joined to the transaction at exit. 1216 */ 1217 void 1218 xfs_exchange_mappings( 1219 struct xfs_trans *tp, 1220 const struct xfs_exchmaps_req *req) 1221 { 1222 struct xfs_exchmaps_intent *xmi; 1223 1224 BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS); 1225 1226 xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL); 1227 xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL); 1228 ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS)); 1229 if (req->flags & XFS_EXCHMAPS_SET_SIZES) 1230 ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK)); 1231 ASSERT(xfs_has_exchange_range(tp->t_mountp)); 1232 1233 if (req->blockcount == 0) 1234 return; 1235 1236 xmi = xfs_exchmaps_init_intent(req); 1237 xfs_exchmaps_defer_add(tp, xmi); 1238 xfs_exchmaps_ensure_reflink(tp, xmi); 1239 xfs_exchmaps_upgrade_extent_counts(tp, xmi); 1240 } 1241