// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_extfree_item.h"
#include "xfs_rmap_item.h"
#include "xfs_refcount_item.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_item.h"
#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap.  In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners,
 * so if the rmapbt says there is an owner of this block other than @oinfo,
 * then the block is crosslinked.  Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space.  Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block.  If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there
 * isn't supposed to be a rmap record and everything is ok.  For other btrees
 * there had to have been an rmap entry for the block to have ended up on
 * @bitmap, so if it's gone now there's something wrong and the fs will shut
 * down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal.  If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking.  We must also invalidate any buffers
 * associated with @bitmap.
 */
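
/*
 * Example (sketch): a btree repair function typically collects the old
 * btree's extents in a bitmap, subtracts the extents still in use by other
 * structures, and hands the result to the reaper, e.g.:
 *
 *	error = xrep_reap_agblocks(sc, &old_btree_blocks,
 *			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
 *
 * The owner info and reservation type shown here are illustrative; callers
 * pass whatever matches the structure that was rebuilt.
 */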

/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	union {
		struct {
			/*
			 * For AG blocks, this is reverse mapping owner and
			 * metadata reservation type.
			 */
			const struct xfs_owner_info	*oinfo;
			enum xfs_ag_resv_type		resv;
		};
		struct {
			/* For file blocks, this is the inode and fork. */
			struct xfs_inode		*ip;
			int				whichfork;
		};
	};

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			nr_binval;

	/* Maximum number of buffers we can invalidate in a single tx. */
	unsigned int			max_binval;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			nr_deferred;

	/* Maximum number of intents we can reap in a single transaction. */
	unsigned int			max_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
	return rs->nr_binval > 0 || rs->nr_deferred > 0;
}

/*
 * Decide if we need to roll the transaction to clear out the log reservation
 * that we allocated to buffer invalidations.
 */
static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
	return rs->nr_binval >= rs->max_binval;
}

/* Reset the buffer invalidation count after rolling. */
static inline void xreap_binval_reset(struct xreap_state *rs)
{
	rs->nr_binval = 0;
}

/*
 * Bump the number of invalidated buffers, and return true if we can
 * continue, or false if we need to roll the transaction.
 */
static inline bool xreap_inc_binval(struct xreap_state *rs)
{
	rs->nr_binval++;
	return rs->nr_binval < rs->max_binval;
}

/*
 * Decide if we want to finish the deferred ops that are attached to the
 * scrub transaction.  We don't want to queue huge chains of deferred ops
 * because that can consume a lot of log space and kernel memory.
 * Hence we trigger an xfs_defer_finish if there are too many deferred reap
 * operations or we've run out of space for invalidations.
 */
static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
	return rs->nr_deferred >= rs->max_deferred;
}

/*
 * Reset the defer chain length and buffer invalidation count after finishing
 * items.
 */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->nr_deferred = 0;
	rs->nr_binval = 0;
}

/*
 * Bump the number of deferred extent reaps.
 */
static inline void xreap_inc_defer(struct xreap_state *rs)
{
	rs->nr_deferred++;
}

/* Force the caller to finish a deferred item chain. */
static inline void xreap_force_defer_finish(struct xreap_state *rs)
{
	rs->nr_deferred = rs->max_deferred;
}

/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
static inline unsigned int
xrep_binval_max_fsblocks(
	struct xfs_mount	*mp)
{
	/* Remote xattr values are the largest buffers that we support. */
	return xfs_attr3_max_rmt_blocks(mp);
}

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
				xrep_binval_max_fsblocks(mp)));
}

/*
 * Return an incore buffer from a sector scan, or NULL if there are no
 * buffers left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we
	 * never own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them.  We
	 * assume that the lack of any other known owners means that the
	 * buffer can be locked without risk of deadlocking.  The buffer
	 * cache cannot detect aliasing, so employ nested loops to scan for
	 * incore buffers of any plausible size.
	 */
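	/*
	 * Note: the outer loop advances @bno one block at a time because a
	 * buffer can start at any block in the extent; the inner scan then
	 * probes every plausible buffer length at that position.
	 */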
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (!xreap_inc_binval(rs)) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  AGFL blocks can only be put back one
 * at a time.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent.  If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same
	 * crosslink status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this AG extent as possible.  The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	ASSERT(rs->resv != XFS_AG_RESV_METAFILE);

	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which
	 * will also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin
	 * gets to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
				*aglenp);

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * t0: Unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
					*aglenp);
			xreap_inc_defer(rs);
			return 0;
		}

		/* t1: unmap crosslinked metadata blocks */
		xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
				rs->oinfo->oi_owner);
		xreap_inc_defer(rs);
		return 0;
	}

	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno.  If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_binval_roll(rs));
		return 0;
	}

	/*
	 * t2: To get rid of CoW staging extents, use deferred work items to
	 * remove the refcountbt records (which removes the rmap records) and
	 * free the extent.  We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out
	 * the CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
		if (error)
			return error;

		xreap_inc_defer(rs);
		return 0;
	}

	/* t3: Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		xreap_force_defer_finish(rs);
		return 0;
	}

	/*
	 * t4: Use deferred frees to get rid of the old btree blocks to try
	 * to minimize the window in which we could crash and lose the old
	 * blocks.  Add a defer ops barrier every other extent to avoid
	 * stressing the system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	xreap_inc_defer(rs);
	if (rs->nr_deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/* Configure the deferral and invalidation limits */
static inline void
xreap_configure_limits(
	struct xreap_state	*rs,
	unsigned int		fixed_overhead,
	unsigned int		variable_overhead,
	unsigned int		per_intent,
	unsigned int		per_binval)
{
	struct xfs_scrub	*sc = rs->sc;
	unsigned int		res = sc->tp->t_log_res - fixed_overhead;

	/* Don't underflow the reservation */
	if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
		ASSERT(sc->tp->t_log_res >=
				(fixed_overhead + variable_overhead));
		xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
		return;
	}
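
	/*
	 * Budget the reservation: split what remains into slots of
	 * variable_overhead bytes, funding one deferred intent per slot;
	 * whatever the intents themselves do not consume is handed to
	 * buffer invalidations.
	 */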
	rs->max_deferred = per_intent ? res / variable_overhead : 0;
	res -= rs->max_deferred * per_intent;
	rs->max_binval = per_binval ? res / per_binval : 0;
}

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single per-AG space extent.  This is not for freeing CoW
 * staging extents.
 */
STATIC void
xreap_configure_agextent_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each
	 * extent that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();

	/*
	 * Various things can happen when reaping non-CoW metadata blocks:
	 *
	 * t1: Unmapping crosslinked metadata blocks: deferred removal of
	 * rmap record.
	 *
	 * t3: Freeing to AGFL: roll and finish deferred items for every
	 * block.  Limits here do not matter.
	 *
	 * t4: Freeing metadata blocks: deferred freeing of the space, which
	 * also removes the rmap record.
	 *
	 * For simplicity, we'll use the worst-case intent size to determine
	 * the maximum number of deferred extents before we have to finish
	 * the whole chain.  If we're trying to reap a btree larger than this
	 * size, a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t1 = rui;
	const unsigned int	t4 = rui + efi;
	const unsigned int	per_intent = max(t1, t4);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of EFI or
	 * RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	step_size = max(f1, f2);

	/* Largest buffer size (in fsblocks) that can be invalidated. */
	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);

	/* Maximum overhead of invalidating one buffer. */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));

	/*
	 * For each transaction in a reap chain, we can delete some number of
	 * extents and invalidate some number of blocks.  We assume that
	 * btree blocks aren't usually contiguous, and that scrub likely
	 * pulled all the buffers into memory.  From these assumptions, set
	 * the maximum number of deferrals we can queue before flushing the
	 * defer chain, and the number of invalidations we can queue before
	 * rolling to a clean transaction (and possibly relogging some of the
	 * deferrals), to the same quantity.
	 */
	const unsigned int	variable_overhead = per_intent + per_binval;

	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
			per_binval);

	trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
			step_size, per_intent, rs->max_deferred);
}

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single CoW staging extent.  This is not for freeing
 * metadata blocks.
 */
STATIC void
xreap_configure_agcow_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each
	 * extent that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	cui = xfs_cui_log_space(1) +
				      xfs_cud_log_space();

	/*
	 * Various things can happen when reaping CoW staging extents:
	 *
	 * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
	 * record, which defers removal of rmap record
	 *
	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
	 * defers removal of rmap record; and deferred removal of the space
	 *
	 * For simplicity, we'll use the worst-case intent size to determine
	 * the maximum number of deferred extents before we have to finish
	 * the whole chain.  If we're trying to reap a btree larger than this
	 * size, a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t0 = cui + rui;
	const unsigned int	t2 = cui + rui + efi;
	const unsigned int	per_intent = max(t0, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of CUI,
	 * EFI, or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_cui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/* Largest buffer size (in fsblocks) that can be invalidated. */
	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);

	/* Overhead of invalidating one buffer. */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));

	/*
	 * For each transaction in a reap chain, we can delete some number of
	 * extents and invalidate some number of blocks.  We assume that CoW
	 * staging extents are usually more than 1 fsblock, and that there
	 * shouldn't be any buffers for those blocks.  From these
	 * assumptions, set the number of deferrals to use as much of the
	 * reservation as possible, but leave space to invalidate 1/8th that
	 * number of buffers.
	 */
	const unsigned int	variable_overhead = per_intent +
						    (per_binval / 8);

	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
			per_binval);

	trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval,
			step_size, per_intent, rs->max_deferred);
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_binval_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub		*sc,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xreap_state	rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= type,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	xreap_configure_agextent_limits(&rs);
	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.  The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means
	 * that we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * Hold the AGF buffer across the deferred chain
			 * processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_binval_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub		*sc,
	struct xfsb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state	rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	if (oinfo == &XFS_RMAP_OINFO_COW)
		xreap_configure_agcow_limits(&rs);
	else
		xreap_configure_agextent_limits(&rs);
	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

#ifdef CONFIG_XFS_RT
/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  Units are rt blocks, not rt extents.
 */
STATIC int
xreap_rgextent_select(
	struct xreap_state	*rs,
	xfs_rgblock_t		rgbno,
	xfs_rgblock_t		rgbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*rglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_rgblock_t		bno = rgbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent.  If so, the block is crosslinked.
	 */
	cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
	error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/*
	 * Figure out how many of the subsequent blocks have the same
	 * crosslink status.
	 */
	while (bno < rgbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

	*rglenp = len;
	trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this rtgroup extent as possible.
 * The number of blocks disposed of will be returned in @rglenp.
 */
STATIC int
xreap_rgextent_iter(
	struct xreap_state	*rs,
	xfs_rgblock_t		rgbno,
	xfs_extlen_t		*rglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_rtblock_t		rtbno;
	int			error;

	/*
	 * The only caller so far is CoW fork repair, so we only know how to
	 * unlink or free CoW staging extents.  Here we don't have to worry
	 * about invalidating buffers!
	 */
	if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
		return -EFSCORRUPTED;
	}
	ASSERT(rs->resv == XFS_AG_RESV_NONE);

	rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);

	/*
	 * t1: There are other rmappings; this block is cross linked and must
	 * not be freed.  Remove the forward and reverse mapping and move on.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
				*rglenp);

		xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
		xreap_inc_defer(rs);
		return 0;
	}

	trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);

	/*
	 * t2: The CoW staging extent is not crosslinked.  Use deferred work
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent.  We're not worried about the system going
	 * down here because log recovery walks the refcount btree to clean
	 * out the CoW staging extents.
	 */
	xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
	error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
			rs->resv,
			XFS_FREE_EXTENT_REALTIME |
			XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	xreap_inc_defer(rs);
	return 0;
}

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single CoW staging extent.  This is not for freeing
 * metadata blocks.
 */
STATIC void
xreap_configure_rgcow_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each
	 * extent that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	cui = xfs_cui_log_space(1) +
				      xfs_cud_log_space();

	/*
	 * Various things can happen when reaping CoW staging extents:
	 *
	 * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
	 * record, which defers removal of rmap record
	 *
	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
	 * defers removal of rmap record; and deferred removal of the space
	 *
	 * For simplicity, we'll use the worst-case intent size to determine
	 * the maximum number of deferred extents before we have to finish
	 * the whole chain.  If we're trying to reap a btree larger than this
	 * size, a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t1 = cui + rui;
	const unsigned int	t2 = cui + rui + efi;
	const unsigned int	per_intent = max(t1, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of CUI,
	 * EFI, or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/*
	 * The only buffer for the rt device is the rtgroup super, so we
	 * don't need to save space for buffer invalidations.
	 */
	xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);

	trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
			rs->max_deferred);
}
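
/*
 * Reaping rt blocks touches the rt bitmap (to free the space), the rt rmap
 * btree (to remove reverse mappings), and the rt refcount btree (to remove
 * CoW staging records), so all three locks are needed.
 */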
#define XREAP_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP | \
				 XFS_RTGLOCK_RMAP | \
				 XFS_RTGLOCK_REFCOUNT)

/*
 * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.  The extent must
 * be aligned to a realtime extent.
 */
STATIC int
xreap_rtmeta_extent(
	uint64_t		rtbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_rgblock_t		rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
	xfs_rgblock_t		rgbno_next = rgbno + len;
	int			error = 0;

	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sr.rtg);

	/*
	 * We're reaping blocks after repairing file metadata, which means
	 * that we have to init the xchk_rt structure ourselves.
	 */
	sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
	if (!sc->sr.rtg)
		return -EFSCORRUPTED;

	xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);

	while (rgbno < rgbno_next) {
		xfs_extlen_t	rglen;
		bool		crosslinked;

		error = xreap_rgextent_select(rs, rgbno, rgbno_next,
				&crosslinked, &rglen);
		if (error)
			goto out_unlock;

		error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
		if (error)
			goto out_unlock;

		if (xreap_want_defer_finish(rs)) {
			error = xfs_defer_finish(&sc->tp);
			if (error)
				goto out_unlock;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			if (error)
				goto out_unlock;
			xreap_binval_reset(rs);
		}

		rgbno += rglen;
	}

out_unlock:
	xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
	xfs_rtgroup_put(sc->sr.rtg);
	sc->sr.rtg = NULL;
	return error;
}

/*
 * Dispose of every block of every rt metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_rtblocks(
	struct xfs_scrub		*sc,
	struct xrtb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state	rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);
	ASSERT(oinfo == &XFS_RMAP_OINFO_COW);

	xreap_configure_rgcow_limits(&rs);
	error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
#endif /* CONFIG_XFS_RT */

/*
 * Dispose of every block of an old metadata btree that used to be rooted in
 * a metadata directory file.
 */
int
xrep_reap_metadir_fsblocks(
	struct xfs_scrub	*sc,
	struct xfsb_bitmap	*bitmap)
{
	/*
	 * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the
	 * old blocks are no longer mapped by the inode, and inode metadata
	 * space reservations can only account freed space to i_nblocks.
	 */
	struct xfs_owner_info	oinfo;
	struct xreap_state	rs = {
		.sc			= sc,
		.oinfo			= &oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);
	ASSERT(xfs_is_metadir_inode(sc->ip));

	xreap_configure_agextent_limits(&rs);
	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs)) {
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return xrep_reset_metafile_resv(sc);
}

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
			imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction.  This is true for
 * most buffers, but there are two cases that we want to catch: large remote
 * xattr value buffers are not logged and can overflow the buffer log item
 * dirty bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

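		/*
		 * Count how many XFS_BLF_CHUNK-sized chunks this map covers,
		 * then how many dirty bitmap words are needed to track those
		 * chunks.  Maps needing more words than the buffer log
		 * format item provides cannot be logged.
		 */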
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}

/*
 * Invalidate any buffers for this file mapping.  The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(rs->whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we
	 * never own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings.
	 * This means that for each block in the mapping, there could exist
	 * an xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the
	 * location of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them.  If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone.  The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible
	 * size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (!xreap_inc_binval(rs)) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
			imap->br_blockcount);
	return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	int			error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked
		 * and must not be freed.  Remove the reverse mapping, leave
		 * the buffer cache in its possibly confused state, and move
		 * on.  We don't want to risk discarding valid data buffers
		 * from anybody else who thinks they own the block, even
		 * though that runs the risk of stale buffer warnings in the
		 * future.
		 */
		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * t0: Schedule removal of the mapping from the fork.  We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent.  This is a bit
	 * of a mess since we don't detect discontiguous buffers that are
	 * indexed by a block starting before the first block of the extent
	 * but overlap anyway.
	 */
	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping.  If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(rs, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * t1: Schedule removal of the mapping from the fork.  We use
	 * deferred work in this function to control the exact sequence of
	 * metadata updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
			XFS_FREE_EXTENT_SKIP_DISCARD);
}

/* Compute the maximum mapcount of a file buffer. */
static unsigned int
xreap_bmapi_binval_mapcount(
	struct xfs_scrub	*sc)
{
	/* directory blocks can span multiple fsblocks and be discontiguous */
	if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
		return sc->mp->m_dir_geo->fsbcount;

	/* all other file xattr/symlink blocks must be contiguous */
	return 1;
}

/* Compute the maximum block size of a file buffer. */
static unsigned int
xreap_bmapi_binval_blocksize(
	struct xfs_scrub	*sc)
{
	switch (sc->sm->sm_type) {
	case XFS_SCRUB_TYPE_DIR:
		return sc->mp->m_dir_geo->blksize;
	case XFS_SCRUB_TYPE_XATTR:
	case XFS_SCRUB_TYPE_PARENT:
		/*
		 * The xattr structure itself consists of single fsblocks,
		 * but there could be remote xattr blocks to invalidate.
		 */
		return XFS_XATTR_SIZE_MAX;
	}

	/* everything else is a single block */
	return sc->mp->m_sb.sb_blocksize;
}

/*
 * Compute the maximum number of buffer invalidations that we can do while
 * reaping a single extent from a file fork.
 */
STATIC void
xreap_configure_bmapi_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/* overhead of invalidating a buffer */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
					xreap_bmapi_binval_blocksize(sc));

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each
	 * extent that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	bui = xfs_bui_log_space(1) +
				      xfs_bud_log_space();

	/*
	 * t1: Unmapping crosslinked file data blocks: one bmap deletion,
	 * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
	 *
	 * t2: Freeing file data blocks: one bmap deletion, possibly an EFI
	 * for underfilled bmbt blocks, and another EFI for the space itself.
	 */
	const unsigned int	t1 = (bui + efi) + rui;
	const unsigned int	t2 = (bui + efi) + efi;
	const unsigned int	per_intent = max(t1, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of BUI,
	 * EFI, or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_bui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/*
	 * Each call to xreap_ifork_extent starts with a clean transaction
	 * and operates on a single mapping by creating a chain of log intent
	 * items for that mapping.  We need to leave enough reservation in
	 * the transaction to log btree buffer and inode updates for each
	 * step in the chain, and to relog the log intents.
	 */
	const unsigned int	per_extent_res = per_intent + step_size;
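
	/*
	 * Each file mapping is reaped in its own chain of transactions, so
	 * the intent overhead is part of the fixed reservation (via
	 * per_extent_res) and we pass zero for the per-intent cost; the
	 * whole variable budget goes to buffer invalidations.
	 */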
	xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);

	trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
			step_size, per_intent, 1);
}

/*
 * Dispose of as much of this file extent as we can.  Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno;
	bool			crosslinked;
	int			error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping,
	 * then update the mapping to use it with the unmap calls.
	 */
	error = xreap_bmapi_select(rs, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file.
 * Callers must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip.
 * The fork must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xreap_state	rs = {
		.sc			= sc,
		.ip			= ip,
		.whichfork		= whichfork,
	};
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	xreap_configure_bmapi_limits(&rs);
	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(&rs, &imap);
			if (error)
				return error;

			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
			xreap_defer_finish_reset(&rs);
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}