// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_defer.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_reflink.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and sets XREP_ALREADY_FIXED in sc->flags if it thinks it repaired anything.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc,
	struct xchk_stats_run	*run)
{
	u64			repair_start;
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

	xchk_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	run->repair_attempted = true;
	repair_start = xchk_stats_now();
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	run->repair_ns += xchk_stats_elapsed_ns(repair_start);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the
		 * problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		run->repair_succeeded = true;
		return -EAGAIN;
	case -ECHRNG:
		sc->flags |= XCHK_NEED_DRAIN;
		run->retries++;
		return -EAGAIN;
	case -EDEADLOCK:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			run->retries++;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so exit to userspace with the scan's output flags unchanged.
		 */
		return 0;
	default:
		/*
		 * EAGAIN tells the caller to re-scrub, so we cannot return
		 * that here.
		 */
		ASSERT(error != -EAGAIN);
		return error;
	}
}

/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}
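
/*
 * Illustrative sketch (editorial note, not part of the original file): callers
 * of xrep_attempt() are expected to treat -EAGAIN as "release resources and
 * re-run the scrub", consulting sc->flags to see which retry behavior was
 * requested.  A hypothetical driver loop might look roughly like this, where
 * the setup/teardown helpers are stand-ins for whatever the real dispatch
 * code in scrub.c actually uses:
 *
 *	while ((error = xrep_attempt(sc, run)) == -EAGAIN) {
 *		xchk_teardown(sc, 0);
 *		error = xchk_setup(sc);	// honors XCHK_TRY_HARDER/NEED_DRAIN
 *		if (error)
 *			break;
 *	}
 *
 * The point of the contract is that XCHK_TRY_HARDER and XCHK_NEED_DRAIN are
 * only consulted on the next setup pass, never inside xrep_attempt() itself.
 */
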
/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xrep_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we roll the transaction.
	 * Ensure that both AG buffers are dirty and held when we roll the
	 * transaction so that they move forward in the log without losing the
	 * bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Roll the transaction.  We still hold the AG header buffers locked
	 * regardless of whether or not that succeeds.  On failure, the buffers
	 * will be released during teardown on our way out of the kernel.  If
	 * successful, join the buffers to the new transaction and move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join the AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);

	return 0;
}

/* Roll the scrub transaction, holding the primary metadata locked. */
int
xrep_roll_trans(
	struct xfs_scrub	*sc)
{
	if (!sc->ip)
		return xrep_roll_ag_trans(sc);
	return xfs_trans_roll_inode(&sc->tp, sc->ip);
}

/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we complete deferred work
	 * items.  Ensure that both AG buffers are dirty and held when we roll
	 * the transaction so that they move forward in the log without losing
	 * the bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Finish all deferred work items.  We still hold the AG header buffers
	 * locked regardless of whether or not that succeeds.  On failure, the
	 * buffers will be released during teardown on our way out of the
	 * kernel.  If successful, join the buffers to the new transaction
	 * and move on.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * Release the hold that we set above because defer_finish won't do
	 * that for us.  The defer roll code redirties held buffers after each
	 * roll, so the AG header buffers should be ready for logging.
	 */
	if (sc->sa.agi_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);

	return 0;
}
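
/*
 * Illustrative sketch (editorial note, not part of the original file): the
 * log/bhold/roll/bjoin pattern used above generalizes to any buffer that must
 * stay locked across a transaction roll.  Assuming a locked buffer bp already
 * joined to tp, the sequence is roughly:
 *
 *	xfs_trans_log_buf(tp, bp, 0, 0);	// keep the bli pinned in the log
 *	xfs_trans_bhold(tp, bp);		// don't unlock bp at commit
 *	error = xfs_trans_roll(&tp);
 *	if (error)
 *		return error;			// bp stays locked for teardown
 *	xfs_trans_bjoin(tp, bp);		// attach bp to the new transaction
 *
 * The AG header helpers above use xfs_ialloc_log_agi()/xfs_alloc_log_agf()
 * rather than a raw xfs_trans_log_buf() because those routines know which
 * header fields to dirty.
 */
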
/*
 * Does the given AG have enough space to rebuild a btree?  Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xrep_ag_has_space(
	struct xfs_perag	*pag,
	xfs_extlen_t		nr_blocks,
	enum xfs_ag_resv_type	type)
{
	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub		*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = NULLAGINO;
	xfs_extlen_t			aglen = NULLAGBLOCK;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (xfs_perag_initialised_agi(pag)) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(pag, NULL, &bp);
		if (!error) {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
	if (error) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(pag, icount)) {
		icount = pag->agino_max - pag->agino_min + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != pag->block_count ||
	    freelen >= aglen) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	}
	xfs_perag_put(pag);

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
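
/*
 * Worked example (editorial note, not part of the original file): suppose the
 * size estimates above came out as bnobt_sz = 120, inobt_sz = 90,
 * rmapbt_sz = 400, and refcbt_sz = 60 blocks (hypothetical numbers).  The
 * reservation is max(max(120, 90), max(400, 60)) = 400 blocks -- enough to
 * rebuild whichever single per-AG btree (or btree pair) turns out to be the
 * most expensive, since a repair only rebuilds one of them at a time.
 */
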
/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure.  For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (bitmap); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist).  We then subtract sublist from bitmap to
 * derive the blocks that were used by the old btree.  These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection.  First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 * records are collected as bitmap.  The bnobt records are collected as
 * sublist.  As with the other btrees we subtract sublist from bitmap, and the
 * result (since the rmapbt lives in the free space) are the blocks from the
 * old rmapbt.
 */

/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
	struct xfs_scrub	*sc,
	bool			can_shrink)
{
	struct xfs_alloc_arg	args = {0};

	args.mp = sc->mp;
	args.tp = sc->tp;
	args.agno = sc->sa.pag->pag_agno;
	args.alignment = 1;
	args.pag = sc->sa.pag;

	return xfs_alloc_fix_freelist(&args,
			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}

/*
 * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 *
 * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 * the AG headers by using the rmap data to rummage through the AG looking for
 * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 * or the rmap data are corrupt.
 *
 * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 * AGI is being rebuilt.  It must maintain these locks until it's safe for
 * other threads to change the btrees' shapes.  The caller provides
 * information about the btrees to look for by passing in an array of
 * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 * The (root, height) fields will be set on return if anything is found.  The
 * last element of the array should have a NULL buf_ops to mark the end of the
 * array.
 *
 * For every rmapbt record matching any of the rmap owners in btree_info,
 * read each block referenced by the rmap record.  If the block is a btree
 * block from this filesystem matching any of the magic numbers and has a
 * level higher than what we've already seen, remember the block and the
 * height of the tree required to have such a block.  When the call completes,
 * we return the highest block we've found for each btree description; those
 * should be the roots.
 */
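
/*
 * Illustrative sketch (editorial note, not part of the original file): a
 * caller rebuilding the AGF might describe the free space btrees roughly like
 * this before calling xrep_find_ag_btree_roots().  The fab[] initializer is a
 * hypothetical example; the real tables live in the AGF/AGI repair code.
 *
 *	struct xrep_find_ag_btree fab[] = {
 *		{
 *			.rmap_owner	= XFS_RMAP_OWN_AG,
 *			.buf_ops	= &xfs_bnobt_buf_ops,
 *		},
 *		{
 *			.rmap_owner	= XFS_RMAP_OWN_AG,
 *			.buf_ops	= &xfs_cntbt_buf_ops,
 *		},
 *		{
 *			.buf_ops	= NULL,	// sentinel terminating the array
 *		},
 *	};
 *
 *	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
 *
 * On return, fab[i].root and fab[i].height describe the best root candidate
 * found for each btree, or NULLAGBLOCK if nothing matched.
 */
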
struct xrep_findroot {
	struct xfs_scrub		*sc;
	struct xfs_buf			*agfl_bp;
	struct xfs_agf			*agf;
	struct xrep_find_ag_btree	*btree_info;
};

/* See if our block is in the AGFL. */
STATIC int
xrep_findroot_agfl_walk(
	struct xfs_mount	*mp,
	xfs_agblock_t		bno,
	void			*priv)
{
	xfs_agblock_t		*agbno = priv;

	return (*agbno == bno) ? -ECANCELED : 0;
}

/* Does this block match the btree information passed in? */
STATIC int
xrep_findroot_block(
	struct xrep_findroot		*ri,
	struct xrep_find_ag_btree	*fab,
	uint64_t			owner,
	xfs_agblock_t			agbno,
	bool				*done_with_block)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_buf			*bp;
	struct xfs_btree_block		*btblock;
	xfs_daddr_t			daddr;
	int				block_level;
	int				error = 0;

	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);

	/*
	 * Blocks in the AGFL have stale contents that might just happen to
	 * have a matching magic and uuid.  We don't want to pull these blocks
	 * in as part of a tree root, so we have to filter out the AGFL stuff
	 * here.  If the AGFL looks insane we'll just refuse to repair.
	 */
	if (owner == XFS_RMAP_OWN_AG) {
		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				xrep_findroot_agfl_walk, &agbno);
		if (error == -ECANCELED)
			return 0;
		if (error)
			return error;
	}

	/*
	 * Read the buffer into memory so that we can see if it's a match for
	 * our btree type.  We have no clue if it is beforehand, and we want to
	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
	 * will cause needless disk reads in subsequent calls to this function)
	 * and logging metadata verifier failures.
	 *
	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
	 * memory from some other caller it will already have b_ops assigned.
	 * If it was in memory from a previous unsuccessful findroot_block
	 * call, the buffer won't have b_ops but it should be clean and ready
	 * for us to try to verify if the read call succeeds.  The same applies
	 * if the buffer wasn't in memory at all.
	 *
	 * Note: If we never match a btree type with this buffer, it will be
	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
	 * the buffer gets written.
	 */
	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
			mp->m_bsize, 0, &bp, NULL);
	if (error)
		return error;

	/* Ensure the block magic matches the btree type we're looking for. */
	btblock = XFS_BUF_TO_BLOCK(bp);
	ASSERT(fab->buf_ops->magic[1] != 0);
	if (btblock->bb_magic != fab->buf_ops->magic[1])
		goto out;

	/*
	 * If the buffer already has ops applied and they're not the ones for
	 * this btree type, we know this block doesn't match the btree and we
	 * can bail out.
	 *
	 * If the buffer ops match ours, someone else has already validated
	 * the block for us, so we can move on to checking if this is a root
	 * block candidate.
	 *
	 * If the buffer does not have ops, nobody has successfully validated
	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
	 * and structure match this btree type then we'll move on to checking
	 * if it's a root block candidate.  If there is no match, bail out.
	 */
	if (bp->b_ops) {
		if (bp->b_ops != fab->buf_ops)
			goto out;
	} else {
		ASSERT(!xfs_trans_buf_is_dirty(bp));
		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
				&mp->m_sb.sb_meta_uuid))
			goto out;
		/*
		 * Read verifiers can reference b_ops, so we set the pointer
		 * here.  If the verifier fails we'll reset the buffer state
		 * to what it was before we touched the buffer.
		 */
		bp->b_ops = fab->buf_ops;
		fab->buf_ops->verify_read(bp);
		if (bp->b_error) {
			bp->b_ops = NULL;
			bp->b_error = 0;
			goto out;
		}

		/*
		 * Some read verifiers will (re)set b_ops, so we must be
		 * careful not to change b_ops after running the verifier.
		 */
	}

	/*
	 * This block passes the magic/uuid and verifier tests for this btree
	 * type.  We don't need the caller to try the other tree types.
	 */
	*done_with_block = true;

	/*
	 * Compare this btree block's level to the height of the current
	 * candidate root block.
	 *
	 * If the level matches the root we found previously, throw away both
	 * blocks because there can't be two candidate roots.
	 *
	 * If level is lower in the tree than the root we found previously,
	 * ignore this block.
	 */
	block_level = xfs_btree_get_level(btblock);
	if (block_level + 1 == fab->height) {
		fab->root = NULLAGBLOCK;
		goto out;
	} else if (block_level < fab->height) {
		goto out;
	}

	/*
	 * This is the highest block in the tree that we've found so far.
	 * Update the btree height to reflect what we've learned from this
	 * block.
	 */
	fab->height = block_level + 1;

	/*
	 * If this block doesn't have sibling pointers, then it's the new root
	 * block candidate.  Otherwise, the root will be found farther up the
	 * tree.
	 */
	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
		fab->root = agbno;
	else
		fab->root = NULLAGBLOCK;

	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
			be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
	xfs_trans_brelse(ri->sc->tp, bp);
	return error;
}

/*
 * Do any of the blocks in this rmap record match one of the btrees we're
 * looking for?
 */
STATIC int
xrep_findroot_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_findroot		*ri = priv;
	struct xrep_find_ag_btree	*fab;
	xfs_agblock_t			b;
	bool				done;
	int				error = 0;

	/* Ignore anything that isn't AG metadata. */
	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* Otherwise scan each block + btree type. */
	for (b = 0; b < rec->rm_blockcount; b++) {
		done = false;
		for (fab = ri->btree_info; fab->buf_ops; fab++) {
			if (rec->rm_owner != fab->rmap_owner)
				continue;
			error = xrep_findroot_block(ri, fab,
					rec->rm_owner, rec->rm_startblock + b,
					&done);
			if (error)
				return error;
			if (done)
				break;
		}
	}

	return 0;
}

/* Find the roots of the per-AG btrees described in btree_info. */
int
xrep_find_ag_btree_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xrep_find_ag_btree	*btree_info,
	struct xfs_buf			*agfl_bp)
{
	struct xfs_mount		*mp = sc->mp;
	struct xrep_findroot		ri;
	struct xrep_find_ag_btree	*fab;
	struct xfs_btree_cur		*cur;
	int				error;

	ASSERT(xfs_buf_islocked(agf_bp));
	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));

	ri.sc = sc;
	ri.btree_info = btree_info;
	ri.agf = agf_bp->b_addr;
	ri.agfl_bp = agfl_bp;
	for (fab = btree_info; fab->buf_ops; fab++) {
		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
		fab->root = NULLAGBLOCK;
		fab->height = 0;
	}

	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
	xfs_btree_del_cursor(cur, error);

	return error;
}

#ifdef CONFIG_XFS_QUOTA
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
	struct xfs_scrub	*sc,
	xfs_dqtype_t		type)
{
	uint			flag;

	flag = xfs_quota_chkd_flag(type);
	if (!(flag & sc->mp->m_qflags))
		return;

	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
	sc->mp->m_qflags &= ~flag;
	spin_lock(&sc->mp->m_sb_lock);
	sc->mp->m_sb.sb_qflags &= ~flag;
	spin_unlock(&sc->mp->m_sb_lock);
	xfs_log_sb(sc->tp);
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}

/*
 * Attach dquots to this inode, or schedule quotacheck to fix them.
 *
 * This function ensures that the appropriate dquots are attached to an inode.
 * We cannot allow the dquot code to allocate an on-disk dquot block here
 * because we're already in transaction context.  The on-disk dquot should
 * already exist anyway.  If the quota code signals corruption or missing
 * quota information, schedule quotacheck, which will repair corruptions in
 * the quota metadata.
 */
int
xrep_ino_dqattach(
	struct xfs_scrub	*sc)
{
	int			error;

	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	error = xfs_qm_dqattach(sc->ip);
	switch (error) {
	case -EFSBADCRC:
	case -EFSCORRUPTED:
	case -ENOENT:
		xfs_err_ratelimited(sc->mp,
"inode %llu repair encountered quota error %d, quotacheck forced.",
				(unsigned long long)sc->ip->i_ino, error);
		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
		fallthrough;
	case -ESRCH:
		error = 0;
		break;
	default:
		break;
	}

	return error;
}
#endif /* CONFIG_XFS_QUOTA */

/*
 * Ensure that the inode being repaired is ready to handle a certain number of
 * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
 * being repaired and have joined it to the scrub transaction.
 */
int
xrep_ino_ensure_extent_count(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_extnum_t		nextents)
{
	xfs_extnum_t		max_extents;
	bool			inode_has_nrext64;

	inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
	max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
	if (nextents <= max_extents)
		return 0;
	if (inode_has_nrext64)
		return -EFSCORRUPTED;
	if (!xfs_has_large_extent_counts(sc->mp))
		return -EFSCORRUPTED;

	max_extents = xfs_iext_max_nextents(true, whichfork);
	if (nextents > max_extents)
		return -EFSCORRUPTED;

	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
	return 0;
}
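
/*
 * Illustrative sketch (editorial note, not part of the original file): a fork
 * repair that has counted the records it is about to write back might call
 * the helper above like this; nrecords is a hypothetical count gathered by
 * the caller's own scan.
 *
 *	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nrecords);
 *	if (error)
 *		return error;	// fork can't hold that many extents
 *
 * On success the inode either already supported nrecords extents or has been
 * upgraded in-core to the large (NREXT64) extent counter format and logged.
 */
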
/*
 * Initialize all the btree cursors for an AG repair except for the btree that
 * we're rebuilding.
 */
void
xrep_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	/* Set up a bnobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_BNO);
		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_CNT);
	}

	/* Set up an inobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
				sa->agi_bp, XFS_BTNUM_INO);
		if (xfs_has_finobt(mp))
			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
	}

	/* Set up a rmapbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
	    xfs_has_rmapbt(mp))
		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);

	/* Set up a refcountbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
	    xfs_has_reflink(mp))
		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
				sa->agf_bp, sc->sa.pag);
}

/*
 * Reinitialize the in-core AG state after a repair by rereading the AGF
 * buffer.  We had better get the same AGF buffer as the one that's attached
 * to the scrub context.
 */
int
xrep_reinit_pagf(
	struct xfs_scrub	*sc)
{
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_buf		*bp;
	int			error;

	ASSERT(pag);
	ASSERT(xfs_perag_initialised_agf(pag));

	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
	error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
	if (error)
		return error;

	if (bp != sc->sa.agf_bp) {
		ASSERT(bp == sc->sa.agf_bp);
		return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Reinitialize the in-core AG state after a repair by rereading the AGI
 * buffer.  We had better get the same AGI buffer as the one that's attached
 * to the scrub context.
 */
int
xrep_reinit_pagi(
	struct xfs_scrub	*sc)
{
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_buf		*bp;
	int			error;

	ASSERT(pag);
	ASSERT(xfs_perag_initialised_agi(pag));

	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
	if (error)
		return error;

	if (bp != sc->sa.agi_bp) {
		ASSERT(bp == sc->sa.agi_bp);
		return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Given an active reference to a perag structure, load AG headers and
 * cursors.  This should only be called to scan an AG while repairing
 * file-based metadata.
 */
int
xrep_ag_init(
	struct xfs_scrub	*sc,
	struct xfs_perag	*pag,
	struct xchk_ag		*sa)
{
	int			error;

	ASSERT(!sa->pag);

	error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
	if (error)
		return error;

	/* Grab our own passive reference from the caller's ref. */
	sa->pag = xfs_perag_hold(pag);
	xrep_ag_btcur_init(sc, sa);
	return 0;
}

/* Reinitialize the per-AG block reservation for the AG we just fixed. */
int
xrep_reset_perag_resv(
	struct xfs_scrub	*sc)
{
	int			error;

	if (!(sc->flags & XREP_RESET_PERAG_RESV))
		return 0;

	ASSERT(sc->sa.pag != NULL);
	ASSERT(sc->ops->type == ST_PERAG);
	ASSERT(sc->tp);

	sc->flags &= ~XREP_RESET_PERAG_RESV;
	error = xfs_ag_resv_free(sc->sa.pag);
	if (error)
		goto out;
	error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
	if (error == -ENOSPC) {
		xfs_err(sc->mp,
"Insufficient free space to reset per-AG reservation for AG %u after repair.",
				sc->sa.pag->pag_agno);
		error = 0;
	}

out:
	return error;
}

/* Decide if we are going to call the repair function for a scrub type. */
bool
xrep_will_attempt(
	struct xfs_scrub	*sc)
{
	/* Userspace asked us to rebuild the structure regardless. */
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
		return true;

	/* Let debug users force us into the repair routines. */
	if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
		return true;

	/* Metadata is corrupt or failed cross-referencing. */
	if (xchk_needs_repair(sc->sm))
		return true;

	return false;
}

/* Try to fix some part of a metadata inode by calling another scrubber. */
STATIC int
xrep_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	__u32			smtype = sc->sm->sm_type;
	__u32			smflags = sc->sm->sm_flags;
	unsigned int		sick_mask = sc->sick_mask;
	int			error;

	/*
	 * Let's see if the inode needs repair.  We're going to open-code calls
	 * to the scrub and repair functions so that we can hang on to the
	 * resources that we already acquired instead of using the standard
	 * setup/teardown routines.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	sc->sm->sm_type = scrub_type;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
	}
	if (error)
		goto out;

	if (!xrep_will_attempt(sc))
		goto out;

	/*
	 * Repair some part of the inode.  This will potentially join the
	 * inode to the transaction.
	 */
	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xrep_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xrep_bmap(sc, XFS_DATA_FORK, false);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xrep_bmap(sc, XFS_ATTR_FORK, false);
		break;
	}
	if (error)
		goto out;

	/*
	 * Finish all deferred intent items and then roll the transaction so
	 * that the inode will not be joined to the transaction when we exit
	 * the function.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		goto out;
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out;

	/*
	 * Clear the corruption flags and re-check the metadata that we just
	 * repaired.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	}
	if (error)
		goto out;

	/* If corruption persists, the repair has failed. */
	if (xchk_needs_repair(sc->sm)) {
		error = -EFSCORRUPTED;
		goto out;
	}
out:
	sc->sick_mask = sick_mask;
	sc->sm->sm_type = smtype;
	sc->sm->sm_flags = smflags;
	return error;
}

/*
 * Repair the ondisk forks of a metadata inode.  The caller must ensure that
 * sc->ip points to the metadata inode and the ILOCK is held on that inode.
 * The inode must not be joined to the transaction before the call, and will
 * not be afterwards.
 */
int
xrep_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			dirty = false;
	int			error;

	/* Repair the inode record and the data fork. */
	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error)
		return error;

	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error)
		return error;

	/* Make sure the attr fork looks ok before we delete it. */
	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
	if (error)
		return error;

	/* Clear the reflink flag since metadata never shares. */
	if (xfs_is_reflink_inode(sc->ip)) {
		dirty = true;
		xfs_trans_ijoin(sc->tp, sc->ip, 0);
		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
		if (error)
			return error;
	}

	/*
	 * If we modified the inode, roll the transaction but don't rejoin the
	 * inode to the new transaction because xrep_bmap_data can do that.
	 */
	if (dirty) {
		error = xfs_trans_roll(&sc->tp);
		if (error)
			return error;
		dirty = false;
	}

	return 0;
}
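
/*
 * Illustrative sketch (editorial note, not part of the original file): a
 * repair function for a metadata inode (say, a hypothetical realtime bitmap
 * repair) would typically make the forks sane before rebuilding the contents:
 *
 *	error = xrep_metadata_inode_forks(sc);
 *	if (error)
 *		return error;
 *
 *	// ... now rebuild the file contents, rolling with xrep_roll_trans()
 *	// and finishing intents with xrep_defer_finish() as needed ...
 *
 * The realtime bitmap example is only hypothetical here; see the individual
 * repair implementations for the real call sites.
 */
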