// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_defer.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it. This function returns -EAGAIN to mean "re-run scrub",
 * and will set *fixed to true if it thinks it repaired anything.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc,
	struct xchk_stats_run	*run)
{
	u64			repair_start;
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

	xchk_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	run->repair_attempted = true;
	repair_start = xchk_stats_now();
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	run->repair_ns += xchk_stats_elapsed_ns(repair_start);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded. Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		run->repair_succeeded = true;
		return -EAGAIN;
	case -ECHRNG:
		sc->flags |= XCHK_NEED_DRAIN;
		run->retries++;
		return -EAGAIN;
	case -EDEADLOCK:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			run->retries++;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it. The corruption has not been fixed,
		 * so exit to userspace with the scan's output flags unchanged.
		 */
		return 0;
	default:
		/*
		 * EAGAIN tells the caller to re-scrub, so we cannot return
		 * that here.
		 */
		ASSERT(error != -EAGAIN);
		return error;
	}
}

/*
 * Complain about unfixable problems in the filesystem. We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair. Unmount and run xfs_repair.");
}

/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xrep_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we roll the transaction.
	 * Ensure that both AG buffers are dirty and held when we roll the
	 * transaction so that they move forward in the log without losing the
	 * bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Roll the transaction. We still hold the AG header buffers locked
	 * regardless of whether or not that succeeds. On failure, the buffers
	 * will be released during teardown on our way out of the kernel. If
	 * successful, join the buffers to the new transaction and move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join the AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);

	return 0;
}

/* Roll the scrub transaction, holding the primary metadata locked. */
int
xrep_roll_trans(
	struct xfs_scrub	*sc)
{
	if (!sc->ip)
		return xrep_roll_ag_trans(sc);
	return xfs_trans_roll_inode(&sc->tp, sc->ip);
}

/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we complete deferred work
	 * items. Ensure that both AG buffers are dirty and held when we roll
	 * the transaction so that they move forward in the log without losing
	 * the bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Finish all deferred work items. We still hold the AG header buffers
	 * locked regardless of whether or not that succeeds. On failure, the
	 * buffers will be released during teardown on our way out of the
	 * kernel. If successful, join the buffers to the new transaction
	 * and move on.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * Release the hold that we set above because defer_finish won't do
	 * that for us. The defer roll code redirties held buffers after each
	 * roll, so the AG header buffers should be ready for logging.
	 */
	if (sc->sa.agi_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);

	return 0;
}

/*
 * Does the given AG have enough space to rebuild a btree? Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xrep_ag_has_space(
	struct xfs_perag	*pag,
	xfs_extlen_t		nr_blocks,
	enum xfs_ag_resv_type	type)
{
	return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair. We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub		*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = NULLAGINO;
	xfs_extlen_t			aglen = NULLAGBLOCK;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (xfs_perag_initialised_agi(pag)) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(pag, NULL, &bp);
		if (!error) {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
	if (error) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(pag, icount)) {
		icount = pag->agino_max - pag->agino_min + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != pag->block_count ||
	    freelen >= aglen) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	}
	xfs_perag_put(pag);

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree. Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks. However, with reflink it's possible to have
		 * more than one rmap record per AG block. We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}

/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it. Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure. For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (bitmap); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist). We then subtract sublist from bitmap to
 * derive the blocks that were used by the old btree. These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection. First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
 * records are collected as bitmap. The bnobt records are collected as
 * sublist. As with the other btrees we subtract sublist from bitmap, and the
 * result (since the rmapbt lives in the free space) is the blocks from the
 * old rmapbt.
 */
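
/*
 * A compressed sketch of that extent-collection flow for a primary per-AG
 * btree. The bitmap and reap helper names below are illustrative only; the
 * real interfaces live in the scrub bitmap and reaping code and may differ:
 *
 *	(collect extents owned by the damaged btree's rmap owner as old_blocks)
 *	(collect extents still referenced by other structures as in_use)
 *	xagb_bitmap_disunion(&old_blocks, &in_use);	// old_blocks -= in_use
 *	(reap everything left in old_blocks, e.g. via xrep_reap_agblocks())
 */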

/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
	struct xfs_scrub	*sc,
	int			alloc_flags)
{
	struct xfs_alloc_arg	args = {0};

	args.mp = sc->mp;
	args.tp = sc->tp;
	args.agno = sc->sa.pag->pag_agno;
	args.alignment = 1;
	args.pag = sc->sa.pag;

	return xfs_alloc_fix_freelist(&args, alloc_flags);
}

/*
 * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 *
 * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 * the AG headers by using the rmap data to rummage through the AG looking for
 * btree roots. This is not guaranteed to work if the AG is heavily damaged
 * or the rmap data are corrupt.
 *
 * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 * AGI is being rebuilt. They must maintain these locks until it's safe for
 * other threads to change the btrees' shapes. The caller provides
 * information about the btrees to look for by passing in an array of
 * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 * The (root, height) fields will be set on return if anything is found. The
 * last element of the array should have a NULL buf_ops to mark the end of the
 * array.
 *
 * For every rmapbt record matching any of the rmap owners in btree_info,
 * read each block referenced by the rmap record. If the block is a btree
 * block from this filesystem matching any of the magic numbers and has a
 * level higher than what we've already seen, remember the block and the
 * height of the tree required to have such a block. When the call completes,
 * we return the highest block we've found for each btree description; those
 * should be the roots.
 */
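
/*
 * An illustrative sketch (not lifted from the real callers) of how a repair
 * function might describe the free space btrees it wants to find, then read
 * back fab[i].root and fab[i].height afterwards:
 *
 *	struct xrep_find_ag_btree	fab[] = {
 *		{ .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_bnobt_buf_ops },
 *		{ .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_cntbt_buf_ops },
 *		{ .buf_ops = NULL },
 *	};
 *
 *	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
 */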

struct xrep_findroot {
	struct xfs_scrub		*sc;
	struct xfs_buf			*agfl_bp;
	struct xfs_agf			*agf;
	struct xrep_find_ag_btree	*btree_info;
};

/* See if our block is in the AGFL. */
STATIC int
xrep_findroot_agfl_walk(
	struct xfs_mount	*mp,
	xfs_agblock_t		bno,
	void			*priv)
{
	xfs_agblock_t		*agbno = priv;

	return (*agbno == bno) ? -ECANCELED : 0;
}

/* Does this block match the btree information passed in? */
STATIC int
xrep_findroot_block(
	struct xrep_findroot		*ri,
	struct xrep_find_ag_btree	*fab,
	uint64_t			owner,
	xfs_agblock_t			agbno,
	bool				*done_with_block)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_buf			*bp;
	struct xfs_btree_block		*btblock;
	xfs_daddr_t			daddr;
	int				block_level;
	int				error = 0;

	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);

	/*
	 * Blocks in the AGFL have stale contents that might just happen to
	 * have a matching magic and uuid. We don't want to pull these blocks
	 * in as part of a tree root, so we have to filter out the AGFL stuff
	 * here. If the AGFL looks insane we'll just refuse to repair.
	 */
	if (owner == XFS_RMAP_OWN_AG) {
		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				xrep_findroot_agfl_walk, &agbno);
		if (error == -ECANCELED)
			return 0;
		if (error)
			return error;
	}

	/*
	 * Read the buffer into memory so that we can see if it's a match for
	 * our btree type. We have no clue if it is beforehand, and we want to
	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
	 * will cause needless disk reads in subsequent calls to this function)
	 * and logging metadata verifier failures.
	 *
	 * Therefore, pass in NULL buffer ops. If the buffer was already in
	 * memory from some other caller it will already have b_ops assigned.
	 * If it was in memory from a previous unsuccessful findroot_block
	 * call, the buffer won't have b_ops but it should be clean and ready
	 * for us to try to verify if the read call succeeds. The same applies
	 * if the buffer wasn't in memory at all.
	 *
	 * Note: If we never match a btree type with this buffer, it will be
	 * left in memory with NULL b_ops. This shouldn't be a problem unless
	 * the buffer gets written.
	 */
	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
			mp->m_bsize, 0, &bp, NULL);
	if (error)
		return error;

	/* Ensure the block magic matches the btree type we're looking for. */
	btblock = XFS_BUF_TO_BLOCK(bp);
	ASSERT(fab->buf_ops->magic[1] != 0);
	if (btblock->bb_magic != fab->buf_ops->magic[1])
		goto out;

	/*
	 * If the buffer already has ops applied and they're not the ones for
	 * this btree type, we know this block doesn't match the btree and we
	 * can bail out.
	 *
	 * If the buffer ops match ours, someone else has already validated
	 * the block for us, so we can move on to checking if this is a root
	 * block candidate.
	 *
	 * If the buffer does not have ops, nobody has successfully validated
	 * the contents and the buffer cannot be dirty. If the magic, uuid,
	 * and structure match this btree type then we'll move on to checking
	 * if it's a root block candidate. If there is no match, bail out.
	 */
	if (bp->b_ops) {
		if (bp->b_ops != fab->buf_ops)
			goto out;
	} else {
		ASSERT(!xfs_trans_buf_is_dirty(bp));
		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
				&mp->m_sb.sb_meta_uuid))
			goto out;
		/*
		 * Read verifiers can reference b_ops, so we set the pointer
		 * here. If the verifier fails we'll reset the buffer state
		 * to what it was before we touched the buffer.
		 */
		bp->b_ops = fab->buf_ops;
		fab->buf_ops->verify_read(bp);
		if (bp->b_error) {
			bp->b_ops = NULL;
			bp->b_error = 0;
			goto out;
		}

		/*
		 * Some read verifiers will (re)set b_ops, so we must be
		 * careful not to change b_ops after running the verifier.
		 */
	}

	/*
	 * This block passes the magic/uuid and verifier tests for this btree
	 * type. We don't need the caller to try the other tree types.
	 */
	*done_with_block = true;

	/*
	 * Compare this btree block's level to the height of the current
	 * candidate root block.
	 *
	 * If the level matches the root we found previously, throw away both
	 * blocks because there can't be two candidate roots.
	 *
	 * If level is lower in the tree than the root we found previously,
	 * ignore this block.
	 */
	block_level = xfs_btree_get_level(btblock);
	if (block_level + 1 == fab->height) {
		fab->root = NULLAGBLOCK;
		goto out;
	} else if (block_level < fab->height) {
		goto out;
	}

	/*
	 * This is the highest block in the tree that we've found so far.
	 * Update the btree height to reflect what we've learned from this
	 * block.
	 */
	fab->height = block_level + 1;

	/*
	 * If this block doesn't have sibling pointers, then it's the new root
	 * block candidate. Otherwise, the root will be found farther up the
	 * tree.
	 */
	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
		fab->root = agbno;
	else
		fab->root = NULLAGBLOCK;

	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
			be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
	xfs_trans_brelse(ri->sc->tp, bp);
	return error;
}

/*
 * Do any of the blocks in this rmap record match one of the btrees we're
 * looking for?
 */
STATIC int
xrep_findroot_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_findroot		*ri = priv;
	struct xrep_find_ag_btree	*fab;
	xfs_agblock_t			b;
	bool				done;
	int				error = 0;

	/* Ignore anything that isn't AG metadata. */
	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* Otherwise scan each block + btree type. */
	for (b = 0; b < rec->rm_blockcount; b++) {
		done = false;
		for (fab = ri->btree_info; fab->buf_ops; fab++) {
			if (rec->rm_owner != fab->rmap_owner)
				continue;
			error = xrep_findroot_block(ri, fab,
					rec->rm_owner, rec->rm_startblock + b,
					&done);
			if (error)
				return error;
			if (done)
				break;
		}
	}

	return 0;
}

/* Find the roots of the per-AG btrees described in btree_info. */
int
xrep_find_ag_btree_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xrep_find_ag_btree	*btree_info,
	struct xfs_buf			*agfl_bp)
{
	struct xfs_mount		*mp = sc->mp;
	struct xrep_findroot		ri;
	struct xrep_find_ag_btree	*fab;
	struct xfs_btree_cur		*cur;
	int				error;

	ASSERT(xfs_buf_islocked(agf_bp));
	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));

	ri.sc = sc;
	ri.btree_info = btree_info;
	ri.agf = agf_bp->b_addr;
	ri.agfl_bp = agfl_bp;
	for (fab = btree_info; fab->buf_ops; fab++) {
		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
		fab->root = NULLAGBLOCK;
		fab->height = 0;
	}

	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
	xfs_btree_del_cursor(cur, error);

	return error;
}

#ifdef CONFIG_XFS_QUOTA
/* Update some quota flags in the superblock. */
void
xrep_update_qflags(
	struct xfs_scrub	*sc,
	unsigned int		clear_flags,
	unsigned int		set_flags)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*bp;

	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
	if ((mp->m_qflags & clear_flags) == 0 &&
	    (mp->m_qflags & set_flags) == set_flags)
		goto no_update;

	mp->m_qflags &= ~clear_flags;
	mp->m_qflags |= set_flags;

	spin_lock(&mp->m_sb_lock);
	mp->m_sb.sb_qflags &= ~clear_flags;
	mp->m_sb.sb_qflags |= set_flags;
	spin_unlock(&mp->m_sb_lock);

	/*
	 * Update the quota flags in the ondisk superblock without touching
	 * the summary counters. We have not quiesced inode chunk allocation,
	 * so we cannot coordinate with updates to the icount and ifree percpu
	 * counters.
	 */
	bp = xfs_trans_getsb(sc->tp);
	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
	xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);

no_update:
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}

/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
	struct xfs_scrub	*sc,
	xfs_dqtype_t		type)
{
	uint			flag;

	flag = xfs_quota_chkd_flag(type);
	if (!(flag & sc->mp->m_qflags))
		return;

	xrep_update_qflags(sc, flag, 0);
}

/*
 * Attach dquots to this inode, or schedule quotacheck to fix them.
 *
 * This function ensures that the appropriate dquots are attached to an inode.
 * We cannot allow the dquot code to allocate an on-disk dquot block here
 * because we're already in transaction context. The on-disk dquot should
 * already exist anyway. If the quota code signals corruption or missing quota
 * information, schedule quotacheck, which will repair corruptions in the quota
 * metadata.
 */
int
xrep_ino_dqattach(
	struct xfs_scrub	*sc)
{
	int			error;

	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	error = xfs_qm_dqattach(sc->ip);
	switch (error) {
	case -EFSBADCRC:
	case -EFSCORRUPTED:
	case -ENOENT:
		xfs_err_ratelimited(sc->mp,
"inode %llu repair encountered quota error %d, quotacheck forced.",
				(unsigned long long)sc->ip->i_ino, error);
		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
		fallthrough;
	case -ESRCH:
		error = 0;
		break;
	default:
		break;
	}

	return error;
}
#endif /* CONFIG_XFS_QUOTA */

/*
 * Ensure that the inode being repaired is ready to handle a certain number of
 * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode
 * being repaired and have joined it to the scrub transaction.
 */
int
xrep_ino_ensure_extent_count(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_extnum_t		nextents)
{
	xfs_extnum_t		max_extents;
	bool			inode_has_nrext64;

	inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
	max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
	if (nextents <= max_extents)
		return 0;
	if (inode_has_nrext64)
		return -EFSCORRUPTED;
	if (!xfs_has_large_extent_counts(sc->mp))
		return -EFSCORRUPTED;

	max_extents = xfs_iext_max_nextents(true, whichfork);
	if (nextents > max_extents)
		return -EFSCORRUPTED;

	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
	return 0;
}
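
/*
 * Sketched usage from a fork rebuilder that has already counted the mappings
 * it intends to write; "nr_mappings" here is a stand-in for that count:
 *
 *	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nr_mappings);
 *	if (error)
 *		return error;
 */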

/*
 * Initialize all the btree cursors for an AG repair except for the btree that
 * we're rebuilding.
 */
void
xrep_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	/* Set up a bnobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);
		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);
	}

	/* Set up an inobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
				sa->agi_bp);
		if (xfs_has_finobt(mp))
			sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
					sc->tp, sa->agi_bp);
	}

	/* Set up a rmapbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
	    xfs_has_rmapbt(mp))
		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);

	/* Set up a refcountbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
	    xfs_has_reflink(mp))
		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
				sa->agf_bp, sc->sa.pag);
}

/*
 * Reinitialize the in-core AG state after a repair by rereading the AGF
 * buffer. We had better get the same AGF buffer as the one that's attached
 * to the scrub context.
 */
int
xrep_reinit_pagf(
	struct xfs_scrub	*sc)
{
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_buf		*bp;
	int			error;

	ASSERT(pag);
	ASSERT(xfs_perag_initialised_agf(pag));

	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
	error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
	if (error)
		return error;

	if (bp != sc->sa.agf_bp) {
		ASSERT(bp == sc->sa.agf_bp);
		return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Reinitialize the in-core AG state after a repair by rereading the AGI
 * buffer. We had better get the same AGI buffer as the one that's attached
 * to the scrub context.
 */
int
xrep_reinit_pagi(
	struct xfs_scrub	*sc)
{
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_buf		*bp;
	int			error;

	ASSERT(pag);
	ASSERT(xfs_perag_initialised_agi(pag));

	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
	if (error)
		return error;

	if (bp != sc->sa.agi_bp) {
		ASSERT(bp == sc->sa.agi_bp);
		return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Given an active reference to a perag structure, load AG headers and cursors.
 * This should only be called to scan an AG while repairing file-based metadata.
 */
int
xrep_ag_init(
	struct xfs_scrub	*sc,
	struct xfs_perag	*pag,
	struct xchk_ag		*sa)
{
	int			error;

	ASSERT(!sa->pag);

	error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
	if (error)
		return error;

	/* Grab our own passive reference from the caller's ref. */
	sa->pag = xfs_perag_hold(pag);
	xrep_ag_btcur_init(sc, sa);
	return 0;
}
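
/*
 * A rough sketch of how a file-based repair might use this while walking
 * every AG (error handling and the actual scan elided):
 *
 *	for_each_perag(mp, agno, pag) {
 *		error = xrep_ag_init(sc, pag, &sc->sa);
 *		...scan the AG...
 *		xchk_ag_free(sc, &sc->sa);
 *	}
 */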

/* Reinitialize the per-AG block reservation for the AG we just fixed. */
int
xrep_reset_perag_resv(
	struct xfs_scrub	*sc)
{
	int			error;

	if (!(sc->flags & XREP_RESET_PERAG_RESV))
		return 0;

	ASSERT(sc->sa.pag != NULL);
	ASSERT(sc->ops->type == ST_PERAG);
	ASSERT(sc->tp);

	sc->flags &= ~XREP_RESET_PERAG_RESV;
	error = xfs_ag_resv_free(sc->sa.pag);
	if (error)
		goto out;
	error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
	if (error == -ENOSPC) {
		xfs_err(sc->mp,
"Insufficient free space to reset per-AG reservation for AG %u after repair.",
				sc->sa.pag->pag_agno);
		error = 0;
	}

out:
	return error;
}

/* Decide if we are going to call the repair function for a scrub type. */
bool
xrep_will_attempt(
	struct xfs_scrub	*sc)
{
	/* Userspace asked us to rebuild the structure regardless. */
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
		return true;

	/* Let debug users force us into the repair routines. */
	if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
		return true;

	/* Metadata is corrupt or failed cross-referencing. */
	if (xchk_needs_repair(sc->sm))
		return true;

	return false;
}

/* Try to fix some part of a metadata inode by calling another scrubber. */
STATIC int
xrep_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	__u32			smtype = sc->sm->sm_type;
	__u32			smflags = sc->sm->sm_flags;
	unsigned int		sick_mask = sc->sick_mask;
	int			error;

	/*
	 * Let's see if the inode needs repair. We're going to open-code calls
	 * to the scrub and repair functions so that we can hang on to the
	 * resources that we already acquired instead of using the standard
	 * setup/teardown routines.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	sc->sm->sm_type = scrub_type;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
	}
	if (error)
		goto out;

	if (!xrep_will_attempt(sc))
		goto out;

	/*
	 * Repair some part of the inode. This will potentially join the inode
	 * to the transaction.
	 */
	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xrep_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xrep_bmap(sc, XFS_DATA_FORK, false);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xrep_bmap(sc, XFS_ATTR_FORK, false);
		break;
	}
	if (error)
		goto out;

	/*
	 * Finish all deferred intent items and then roll the transaction so
	 * that the inode will not be joined to the transaction when we exit
	 * the function.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		goto out;
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out;

	/*
	 * Clear the corruption flags and re-check the metadata that we just
	 * repaired.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	}
	if (error)
		goto out;

	/* If corruption persists, the repair has failed. */
	if (xchk_needs_repair(sc->sm)) {
		error = -EFSCORRUPTED;
		goto out;
	}
out:
	sc->sick_mask = sick_mask;
	sc->sm->sm_type = smtype;
	sc->sm->sm_flags = smflags;
	return error;
}

/*
 * Repair the ondisk forks of a metadata inode. The caller must ensure that
 * sc->ip points to the metadata inode and the ILOCK is held on that inode.
 * The inode must not be joined to the transaction before the call, and will
 * not be afterwards.
 */
int
xrep_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			dirty = false;
	int			error;

	/* Repair the inode record and the data fork. */
	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error)
		return error;

	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error)
		return error;

	/* Make sure the attr fork looks ok before we delete it. */
	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
	if (error)
		return error;

	/* Clear the reflink flag since metadata never shares. */
	if (xfs_is_reflink_inode(sc->ip)) {
		dirty = true;
		xfs_trans_ijoin(sc->tp, sc->ip, 0);
		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
		if (error)
			return error;
	}

	/*
	 * If we modified the inode, roll the transaction but don't rejoin the
	 * inode to the new transaction because xrep_bmap_data can do that.
	 */
	if (dirty) {
		error = xfs_trans_roll(&sc->tp);
		if (error)
			return error;
		dirty = false;
	}

	return 0;
}

/*
 * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating
 * a shmem file might take locks, so we cannot be in transaction context. Park
 * our resources in the scrub context and let the teardown function take care
 * of them at the right time.
 */
int
xrep_setup_xfbtree(
	struct xfs_scrub	*sc,
	const char		*descr)
{
	ASSERT(sc->tp == NULL);

	return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
}

/*
 * Create a dummy transaction for use in a live update hook function. This
 * function MUST NOT be called from regular repair code because the current
 * process' transaction is saved via the cookie.
 */
int
xrep_trans_alloc_hook_dummy(
	struct xfs_mount	*mp,
	void			**cookiep,
	struct xfs_trans	**tpp)
{
	int			error;

	*cookiep = current->journal_info;
	current->journal_info = NULL;

	error = xfs_trans_alloc_empty(mp, tpp);
	if (!error)
		return 0;

	current->journal_info = *cookiep;
	*cookiep = NULL;
	return error;
}

/* Cancel a dummy transaction used by a live update hook function. */
void
xrep_trans_cancel_hook_dummy(
	void			**cookiep,
	struct xfs_trans	*tp)
{
	xfs_trans_cancel(tp);
	current->journal_info = *cookiep;
	*cookiep = NULL;
}
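
/*
 * A live update hook would typically bracket its work with the two helpers
 * above, roughly like this (sketch only; the real hook bodies live in the
 * individual repair functions):
 *
 *	error = xrep_trans_alloc_hook_dummy(mp, &cookie, &tp);
 *	if (error)
 *		return error;
 *	...apply the incore update against tp...
 *	xrep_trans_cancel_hook_dummy(&cookie, tp);
 */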