// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap. In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk. The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked. Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space. Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block. If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be an rmap record and everything is ok. For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal. If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking. We must also invalidate any buffers
 * associated with @bitmap.
 */
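/*
 * To summarize the rules above, the disposition of each block in @bitmap
 * breaks down as follows:
 *
 *	rmapbt state for the block	disposition
 *	------------------------------	------------------------------
 *	other owners besides @oinfo	crosslinked; remove only our
 *					reverse mapping
 *	@oinfo is the sole owner	free the extent, which also
 *					removes the reverse mapping
 *	no rmap records at all		free the extent; only expected
 *					for btrees that live in the
 *					free space
 */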
/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	/* Reverse mapping owner and metadata reservation type. */
	const struct xfs_owner_info	*oinfo;
	enum xfs_ag_resv_type		resv;

	/* If true, roll the transaction before reaping the next extent. */
	bool				force_roll;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			deferred;

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			invalidated;

	/* Number of deferred reaps queued during the whole reap sequence. */
	unsigned long long		total_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred)
		return true;
	if (rs->invalidated)
		return true;
	if (rs->total_deferred)
		return true;
	return false;
}

#define XREAP_MAX_BINVAL	(2048)

/*
 * Decide if we want to roll the transaction after reaping an extent. We
 * don't want to overrun the transaction reservation, so we prohibit more
 * than 128 EFIs (XREP_MAX_ITRUNCATE_EFIS) per transaction. For the same
 * reason, we limit the number of buffer invalidations to 2048
 * (XREAP_MAX_BINVAL).
 */
static inline bool xreap_want_roll(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
		return true;
	if (rs->invalidated > XREAP_MAX_BINVAL)
		return true;
	return false;
}

/* Reset the per-transaction reap state after rolling the transaction. */
static inline void xreap_reset(struct xreap_state *rs)
{
	rs->total_deferred += rs->deferred;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

#define XREAP_MAX_DEFER_CHAIN	(2048)

/*
 * Decide if we want to finish the deferred ops that are attached to the scrub
 * transaction. We don't want to queue huge chains of deferred ops because
 * that can consume a lot of log space and kernel memory. Hence we trigger an
 * xfs_defer_finish if there are more than 2048 deferred reap operations or
 * the caller did some real work.
 */
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
		return true;
	return false;
}

/* Reset all of the reap state after finishing the deferred op chain. */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->total_deferred = 0;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}
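/*
 * The reap loops below pair these predicates with their reset helpers: after
 * disposing of each extent, a caller checks xreap_want_defer_finish() first
 * and only then xreap_want_roll(). This is the loop body used by
 * xreap_agmeta_extent() below:
 *
 *	if (xreap_want_defer_finish(rs)) {
 *		error = xrep_defer_finish(sc);
 *		if (error)
 *			return error;
 *		xreap_defer_finish_reset(rs);
 *	} else if (xreap_want_roll(rs)) {
 *		error = xrep_roll_ag_trans(sc);
 *		if (error)
 *			return error;
 *		xreap_reset(rs);
 *	}
 */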
/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	int			max_fsbs;

	/* Remote xattr values are the largest buffers that we support. */
	max_fsbs = xfs_attr3_max_rmt_blocks(mp);

	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}

/*
 * Return an incore buffer from a sector scan, or NULL if there are no buffers
 * left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them. We
	 * assume that the lack of any other known owners means that the
	 * buffer can be locked without risk of deadlocking. The buffer cache
	 * cannot detect aliasing, so employ nested loops to scan for incore
	 * buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
			rs->invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (rs->invalidated > XREAP_MAX_BINVAL) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}
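/*
 * To illustrate the aliasing problem that the nested loops above work
 * around: a single fs block at @agbno could be cached both as a one-block
 * btree buffer and as part of a longer multi-block buffer (e.g. a remote
 * xattr value) starting at the same disk address. Each outer iteration
 * therefore holds the disk address fixed while xrep_bufscan_advance() tries
 * every buffer length from one fs block up to the scan's max_sectors.
 */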
/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call. Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed. AGFL blocks can only be put back one at
 * a time.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent. If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same
	 * crosslink status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}
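/*
 * For example, if blocks agbno through agbno + 2 are crosslinked but
 * agbno + 3 is not, xreap_agextent_select() returns *aglenp = 3 with
 * *crosslinked = true; the next call starts at agbno + 3 and gathers the
 * single-owner run that follows.
 */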
/*
 * Dispose of as much of the beginning of this AG extent as possible. The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed. Remove the reverse mapping and move on. Otherwise,
	 * we were the only owner of the block, so free the extent, which
	 * will also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet). The verifiers will
	 * blow up on writeout, the filesystem will shut down, and the admin
	 * gets to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

		rs->force_roll = true;

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * If we're unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
			return 0;
		}

		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
				*aglenp, rs->oinfo);
	}

	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno. If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_roll(rs));
		return 0;
	}

	/*
	 * If we're getting rid of CoW staging extents, use deferred work
	 * items to remove the refcountbt records (which removes the rmap
	 * records) and free the extent. We're not worried about the system
	 * going down here because log recovery walks the refcount btree to
	 * clean out the CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/* Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/*
	 * Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old
	 * blocks. Add a defer ops barrier every other extent to avoid
	 * stressing the system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	rs->deferred++;
	if (rs->deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub		*sc,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= type,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
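/*
 * Note that the reap entry points assert xfs_has_rmapbt(): without reverse
 * mapping records there is no way for xreap_agextent_select() to decide
 * whether a block is crosslinked, so reaping is only supported on
 * filesystems with the rmap btree feature enabled.
 */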
/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately. The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means
	 * that we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * xrep_defer_finish() holds the AGF buffer across
			 * the deferred chain processing for us.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub		*sc,
	struct xfsb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
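/*
 * The functions below apply the same crosslink-or-free decision to blocks
 * mapped into a file fork, where ownership is expressed as (inode, fork,
 * file offset) via xfs_rmap_ino_owner() rather than as a static metadata
 * owner.
 */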
/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction. This is true for
 * most buffers, but there are two cases that we want to catch: large remote
 * xattr value buffers are not logged and can overflow the buffer log item
 * dirty bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}
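/*
 * In other words, a buffer map is loggable only if its length in
 * XFS_BLF_CHUNK-sized pieces fits within the XFS_BLF_DATAMAP_SIZE * NBWORD
 * bits of the buffer log item dirty bitmap; anything longer must be
 * invalidated by marking it stale directly, as xreap_bmapi_binval() does
 * below.
 */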
/*
 * Invalidate any buffers for this file mapping. The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	unsigned int		invalidated = 0;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings.
	 * This means that for each block in the mapping, there could exist
	 * an xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the location
	 * of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them. If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone. The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}
			invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * much of the mapping we've seen so far.
			 */
			if (invalidated > XREAP_MAX_BINVAL) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
	return 0;
}
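/*
 * Example of the hole scan above: if @imap covers file blocks 0-4 and
 * blocks 5-9 are also mapped but block 10 is a hole, then off lands on
 * block 10 and scan_blocks starts at 10. A buffer indexed by block 0 can
 * therefore be up to ten blocks long (further capped by the maximum remote
 * xattr buffer size), and as bno advances, scan_blocks shrinks so that the
 * buffer scan never extends past the hole.
 */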
/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap,
	bool				crosslinked)
{
	int				error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked
		 * and must not be freed. Remove the reverse mapping, leave
		 * the buffer cache in its possibly confused state, and move
		 * on. We don't want to risk discarding valid data buffers
		 * from anybody else who thinks they own the block, even
		 * though that runs the risk of stale buffer warnings in the
		 * future.
		 */
		trace_xreap_dispose_unmap_extent(sc->sa.pag,
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * Schedule removal of the mapping from the fork. We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent. This is a bit
	 * of a mess since we don't detect discontiguous buffers that are
	 * indexed by a block starting before the first block of the extent
	 * but overlap anyway.
	 */
	trace_xreap_dispose_free_extent(sc->sa.pag,
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping. If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(sc, ip, whichfork, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * Schedule removal of the mapping from the fork. We use deferred
	 * log intents in this function to control the exact sequence of
	 * metadata updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
			XFS_FREE_EXTENT_SKIP_DISCARD);
}

/*
 * Dispose of as much of this file extent as we can. Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap)
{
	xfs_agnumber_t			agno;
	bool				crosslinked;
	int				error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, ip, whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * update the mapping to use it with the unmap calls.
	 */
	error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}
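/*
 * Unlike the AG metadata loops above, the ifork walk below does not batch
 * work through an xreap_state; instead it finishes the whole deferred chain
 * after every real extent (see the xfs_defer_finish() call in
 * xrep_reap_ifork()), so each mapping's unmap, rmap, and free intents commit
 * before the next extent is read.
 */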
/*
 * Dispose of each block mapped to the given fork of the given file. Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
 * must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(sc, ip, whichfork, &imap);
			if (error)
				return error;

			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}