// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
#include "scrub/tempfile.h"

/* Common code for the metadata scrubbers. */

/*
 * Handling operational errors.
 *
 * The *_process_error() family of functions are used to process error return
 * codes from functions called as part of a scrub operation.
 *
 * If there's no error, we return true to tell the caller that it's ok
 * to move on to the next check in its list.
 *
 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
 * caller that something bad happened, and we preserve *error so that
 * the caller can return the *error up the stack to userspace.
 *
 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
 * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
 * not via return codes.  We return false to tell the caller that
 * something bad happened.  Since the error has been cleared, the caller
 * will (presumably) return that zero and scrubbing will move on to
 * whatever's next.
 *
 * ftrace can be used to record the precise metadata location and the
 * approximate code location of the failed operation.
 */
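
/*
 * Illustrative sketch only (not called anywhere in this file): a scrubber
 * typically funnels the return code of a libxfs call through
 * xchk_process_error() and stops if the helper says so.  The btree call and
 * the local variables below are hypothetical:
 *
 *	error = xfs_alloc_get_rec(sc->sa.bno_cur, &agbno, &len, &has_rec);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 */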

/* Check for operational errors. */
static bool
__xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(
				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
				sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		break;
	}
	return false;
}

bool
xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_process_rt_error(
	struct xfs_scrub	*sc,
	xfs_rgnumber_t		rgno,
	xfs_rgblock_t		rgbno,
	int			*error)
{
	return __xchk_process_error(sc, rgno, rgbno, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_xref_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/* Check for operational errors for a file offset. */
static bool
__xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		break;
	}
	return false;
}

bool
xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_fblock_xref_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/*
 * Handling scrub corruption/optimization/warning checks.
 *
 * The *_set_{corrupt,preen,warning}() family of functions are used to
 * record the presence of metadata that is incorrect (corrupt), could be
 * optimized somehow (preen), or should be flagged for administrative
 * review but is not incorrect (warn).
 *
 * ftrace can be used to record the precise metadata location and
 * approximate code location of the failed check.
 */
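
/*
 * Illustrative sketch only: a check that spots a bad field records it and
 * keeps going rather than returning an error; a cross-referencing check
 * would use the _xref_ variant so that XCORRUPT is set instead of CORRUPT.
 * The AGF field comparison below is hypothetical:
 *
 *	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_agblocks)
 *		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 */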

/* Record a block which could be optimized. */
void
xchk_block_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record an inode which could be optimized.  The trace data will include
 * the inode number of the inode in question.
 */
void
xchk_ino_set_preen(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_ino_preen(sc, ino, __return_address);
}

/* Record something being wrong with the filesystem primary superblock. */
void
xchk_set_corrupt(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fs_error(sc, 0, __return_address);
}

/* Record a corrupt block. */
void
xchk_block_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

#ifdef CONFIG_XFS_QUOTA
/* Record a corrupt quota counter. */
void
xchk_qcheck_set_corrupt(
	struct xfs_scrub	*sc,
	unsigned int		dqtype,
	xfs_dqid_t		id)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
}
#endif

/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record a corrupt inode.  The trace data will include the inode number
 * of the inode in question.
 */
void
xchk_ino_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record a corruption while cross-referencing with an inode. */
void
xchk_ino_xref_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record corruption in a block indexed by a file fork. */
void
xchk_fblock_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/* Record a corruption while cross-referencing a fork block. */
void
xchk_fblock_xref_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/*
 * Warn about inodes that need administrative review but are not
 * incorrect.
 */
void
xchk_ino_set_warning(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_ino_warning(sc, ino, __return_address);
}

/* Warn about a block indexed by a file fork that needs review. */
void
xchk_fblock_set_warning(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
}

/* Signal an incomplete scrub. */
void
xchk_set_incomplete(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
	trace_xchk_incomplete(sc, __return_address);
}

/*
 * rmap scrubbing -- compute the number of blocks with a given owner,
 * at least according to the reverse mapping data.
 */

struct xchk_rmap_ownedby_info {
	const struct xfs_owner_info	*oinfo;
	xfs_filblks_t			*blocks;
};

STATIC int
xchk_count_rmap_ownedby_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xchk_rmap_ownedby_info	*sroi = priv;
	bool				irec_attr;
	bool				oinfo_attr;

	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;

	if (rec->rm_owner != sroi->oinfo->oi_owner)
		return 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
		(*sroi->blocks) += rec->rm_blockcount;

	return 0;
}

/*
 * Calculate the number of blocks the rmap thinks are owned by something.
 * The caller should pass us an rmapbt cursor.
 */
int
xchk_count_rmap_ownedby_ag(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	const struct xfs_owner_info	*oinfo,
	xfs_filblks_t			*blocks)
{
	struct xchk_rmap_ownedby_info	sroi = {
		.oinfo			= oinfo,
		.blocks			= blocks,
	};

	*blocks = 0;
	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
			&sroi);
}
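
/*
 * Illustrative sketch only: an AG header scrubber might cross-reference its
 * own idea of header-owned blocks against the rmapbt using the helper above.
 * The owner info constant is real; the surrounding logic is hypothetical:
 *
 *	xfs_filblks_t	blocks;
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */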

/*
 * AG scrubbing
 *
 * These helpers facilitate locking an allocation group's header
 * buffers, setting up cursors for all btrees that are present, and
 * cleaning everything up once we're through.
 */

/* Decide if we want to return an AG header read failure. */
static inline bool
want_ag_read_header_failure(
	struct xfs_scrub	*sc,
	unsigned int		type)
{
	/* Return all AG header read failures when scanning btrees. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
		return true;
	/*
	 * If we're scanning a given type of AG header, we only want to
	 * see read failures from that specific header.  We'd like the
	 * other headers to cross-check them, but this isn't required.
	 */
	if (sc->sm->sm_type == type)
		return true;
	return false;
}

/*
 * Grab the AG header buffers for the attached perag structure.
 *
 * The headers should be released by xchk_ag_free, but as a fail safe we
 * attach all the buffers we grab to the scrub transaction so they'll all
 * be freed when we cancel it.
 */
static inline int
xchk_perag_read_headers(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	int			error;

	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
		return error;

	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
		return error;

	return 0;
}

/*
 * Grab the AG headers for the attached perag structure and wait for pending
 * intents to drain.
 */
int
xchk_perag_drain_and_lock(
	struct xfs_scrub	*sc)
{
	struct xchk_ag		*sa = &sc->sa;
	int			error = 0;

	ASSERT(sa->pag != NULL);
	ASSERT(sa->agi_bp == NULL);
	ASSERT(sa->agf_bp == NULL);

	do {
		if (xchk_should_terminate(sc, &error))
			return error;

		error = xchk_perag_read_headers(sc, sa);
		if (error)
			return error;

		/*
		 * If we've grabbed an inode for scrubbing then we assume that
		 * holding its ILOCK will suffice to coordinate with any intent
		 * chains involving this inode.
		 */
		if (sc->ip)
			return 0;

		/*
		 * Decide if this AG is quiet enough for all metadata to be
		 * consistent with each other.  XFS allows the AG header buffer
		 * locks to cycle across transaction rolls while processing
		 * chains of deferred ops, which means that there could be
		 * other threads in the middle of processing a chain of
		 * deferred ops.  For regular operations we are careful about
		 * ordering operations to prevent collisions between threads
		 * (which is why we don't need a per-AG lock), but scrub and
		 * repair have to serialize against chained operations.
		 *
		 * We just locked all the AG header buffers; now take a look
		 * to see if there are any intents in progress.  If there are,
		 * drop the AG headers and wait for the intents to drain.
		 * Since we hold all the AG header locks for the duration of
		 * the scrub, this is the only time we have to sample the
		 * intents counter; any threads increasing it after this point
		 * can't possibly be in the middle of a chain of AG metadata
		 * updates.
		 *
		 * Obviously, this should be slanted against scrub and in favor
		 * of runtime threads.
		 */
		if (!xfs_group_intent_busy(pag_group(sa->pag)))
			return 0;

		if (sa->agf_bp) {
			xfs_trans_brelse(sc->tp, sa->agf_bp);
			sa->agf_bp = NULL;
		}

		if (sa->agi_bp) {
			xfs_trans_brelse(sc->tp, sa->agi_bp);
			sa->agi_bp = NULL;
		}

		if (!(sc->flags & XCHK_FSGATES_DRAIN))
			return -ECHRNG;
		error = xfs_group_intent_drain(pag_group(sa->pag));
		if (error == -ERESTARTSYS)
			error = -EINTR;
	} while (!error);

	return error;
}

/*
 * Grab the per-AG structure, grab all AG header buffers, and wait until there
 * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 * structure.
 */
int
xchk_ag_read_headers(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	ASSERT(!sa->pag);
	sa->pag = xfs_perag_get(mp, agno);
	if (!sa->pag)
		return -ENOENT;

	return xchk_perag_drain_and_lock(sc);
}

/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
	struct xchk_ag		*sa)
{
	if (sa->refc_cur)
		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
	if (sa->rmap_cur)
		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
	if (sa->fino_cur)
		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
	if (sa->ino_cur)
		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
	if (sa->cnt_cur)
		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
	if (sa->bno_cur)
		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);

	sa->refc_cur = NULL;
	sa->rmap_cur = NULL;
	sa->fino_cur = NULL;
	sa->ino_cur = NULL;
	sa->bno_cur = NULL;
	sa->cnt_cur = NULL;
}

/* Initialize all the btree cursors for an AG. */
void
xchk_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	if (sa->agf_bp) {
		/* Set up a bnobt cursor for cross-referencing. */
		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
				XFS_SCRUB_TYPE_BNOBT);

		/* Set up a cntbt cursor for cross-referencing. */
		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
				XFS_SCRUB_TYPE_CNTBT);

		/* Set up a rmapbt cursor for cross-referencing. */
		if (xfs_has_rmapbt(mp)) {
			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
					XFS_SCRUB_TYPE_RMAPBT);
		}

		/* Set up a refcountbt cursor for cross-referencing. */
		if (xfs_has_reflink(mp)) {
			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
					XFS_SCRUB_TYPE_REFCNTBT);
		}
	}

	if (sa->agi_bp) {
		/* Set up an inobt cursor for cross-referencing. */
		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
				sa->agi_bp);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
				XFS_SCRUB_TYPE_INOBT);

		/* Set up a finobt cursor for cross-referencing. */
		if (xfs_has_finobt(mp)) {
			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
					sa->agi_bp);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
					XFS_SCRUB_TYPE_FINOBT);
		}
	}
}

/* Release the AG header context and btree cursors. */
void
xchk_ag_free(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	xchk_ag_btcur_free(sa);
	xrep_reset_perag_resv(sc);
	if (sa->agf_bp) {
		xfs_trans_brelse(sc->tp, sa->agf_bp);
		sa->agf_bp = NULL;
	}
	if (sa->agi_bp) {
		xfs_trans_brelse(sc->tp, sa->agi_bp);
		sa->agi_bp = NULL;
	}
	if (sa->pag) {
		xfs_perag_put(sa->pag);
		sa->pag = NULL;
	}
}

/*
 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 * order.  Locking order requires us to get the AGI before the AGF.  We use the
 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 * caller passes one in (bmap scrub) or we have to create a transaction
 * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
 */
int
xchk_ag_init(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	int			error;

	error = xchk_ag_read_headers(sc, agno, sa);
	if (error)
		return error;

	xchk_ag_btcur_init(sc, sa);
	return 0;
}
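
/*
 * Illustrative sketch only: an AG btree scrubber's setup path could pair
 * xchk_ag_init() with the error helpers above; teardown runs through
 * xchk_ag_free() when the scrub context is released.  The surrounding code
 * is hypothetical:
 *
 *	error = xchk_ag_init(sc, agno, &sc->sa);
 *	if (!xchk_process_error(sc, agno, 0, &error))
 *		return error;
 */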

#ifdef CONFIG_XFS_RT
/*
 * For scrubbing a realtime group, grab all the in-core resources we'll need to
 * check the metadata, which means taking the ILOCK of the realtime group's
 * metadata inodes.  Callers must not join these inodes to the transaction with
 * non-zero lockflags or concurrency problems will result.  The @rtglock_flags
 * argument to xchk_rtgroup_lock() takes XFS_RTGLOCK_* flags.
 */
int
xchk_rtgroup_init(
	struct xfs_scrub	*sc,
	xfs_rgnumber_t		rgno,
	struct xchk_rt		*sr)
{
	ASSERT(sr->rtg == NULL);
	ASSERT(sr->rtlock_flags == 0);

	sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
	if (!sr->rtg)
		return -ENOENT;
	return 0;
}

void
xchk_rtgroup_lock(
	struct xchk_rt		*sr,
	unsigned int		rtglock_flags)
{
	xfs_rtgroup_lock(sr->rtg, rtglock_flags);
	sr->rtlock_flags = rtglock_flags;
}

/*
 * Unlock the realtime group.  This must be done /after/ committing (or
 * cancelling) the scrub transaction.
 */
static void
xchk_rtgroup_unlock(
	struct xchk_rt		*sr)
{
	ASSERT(sr->rtg != NULL);

	if (sr->rtlock_flags) {
		xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
		sr->rtlock_flags = 0;
	}
}

/*
 * Unlock the realtime group and release its resources.  This must be done
 * /after/ committing (or cancelling) the scrub transaction.
 */
void
xchk_rtgroup_free(
	struct xfs_scrub	*sc,
	struct xchk_rt		*sr)
{
	ASSERT(sr->rtg != NULL);

	xchk_rtgroup_unlock(sr);

	xfs_rtgroup_put(sr->rtg);
	sr->rtg = NULL;
}
#endif /* CONFIG_XFS_RT */
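
/*
 * Illustrative sketch only: a realtime group scrubber would pair the helpers
 * above roughly as follows, and tear everything down with xchk_rtgroup_free()
 * after the scrub transaction has been committed or cancelled.  The group
 * number source and the lock flags are assumptions made for this sketch:
 *
 *	error = xchk_rtgroup_init(sc, rgno, &sc->sr);
 *	if (error)
 *		return error;
 *	xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
 */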

/* Per-scrubber setup functions */

void
xchk_trans_cancel(
	struct xfs_scrub	*sc)
{
	xfs_trans_cancel(sc->tp);
	sc->tp = NULL;
}

int
xchk_trans_alloc_empty(
	struct xfs_scrub	*sc)
{
	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}

/*
 * Grab an empty transaction so that we can re-grab locked buffers if
 * one of our btrees turns out to be cyclic.
 *
 * If we're going to repair something, we need to ask for the largest possible
 * log reservation so that we can handle the worst case scenario for metadata
 * updates while rebuilding a metadata item.  We also need to reserve as many
 * blocks in the head transaction as we think we're going to need to rebuild
 * the metadata object.
 */
int
xchk_trans_alloc(
	struct xfs_scrub	*sc,
	uint			resblks)
{
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
				resblks, 0, 0, &sc->tp);

	return xchk_trans_alloc_empty(sc);
}

/* Set us up with a transaction and an empty context. */
int
xchk_setup_fs(
	struct xfs_scrub	*sc)
{
	uint			resblks;

	resblks = xrep_calc_ag_resblks(sc);
	return xchk_trans_alloc(sc, resblks);
}

/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
	struct xfs_scrub	*sc,
	bool			force_log)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	/*
	 * If the caller asks us to checkpoint the log, do so.  This
	 * expensive operation should be performed infrequently and only
	 * as a last resort.  Any caller that sets force_log should
	 * document why they need to do so.
	 */
	if (force_log) {
		error = xchk_checkpoint_log(mp);
		if (error)
			return error;
	}

	error = xchk_setup_fs(sc);
	if (error)
		return error;

	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
}

/* Push everything out of the log onto disk. */
int
xchk_checkpoint_log(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all_sync(mp->m_ail);
	return 0;
}

/* Verify that an inode is allocated ondisk, then return its cached inode. */
int
xchk_iget(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_inode	**ipp)
{
	ASSERT(sc->tp != NULL);

	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
}

/*
 * Try to grab an inode in a manner that avoids races with physical inode
 * allocation.  If we can't, return the locked AGI buffer so that the caller
 * can single-step the loading process to see where things went wrong.
 * Callers must have a valid scrub transaction.
 *
 * If the iget succeeds, return 0, a NULL AGI, and the inode.
 *
 * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 * no longer allocated; or any other corruption or runtime error.
 *
 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 *
 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 */
int
xchk_iget_agi(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_buf		**agi_bpp,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = sc->tp;
	struct xfs_perag	*pag;
	int			error;

	ASSERT(sc->tp != NULL);

again:
	*agi_bpp = NULL;
	*ipp = NULL;
	error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
	 * in the iget cache miss path.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
	xfs_perag_put(pag);
	if (error)
		return error;

	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
			ipp);
	if (error == -EAGAIN) {
		/*
		 * The inode may be in core but temporarily unavailable and may
		 * require the AGI buffer before it can be returned.  Drop the
		 * AGI buffer and retry the lookup.
		 *
		 * Incore lookup will fail with EAGAIN on a cache hit if the
		 * inode is queued to the inactivation list.  The inactivation
		 * worker may remove the inode from the unlinked list and hence
		 * needs the AGI.
		 *
		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
		 * to allow inodegc to make progress and move the inode to
		 * IRECLAIMABLE state where xfs_iget will be able to return it
		 * again if it can lock the inode.
		 */
		xfs_trans_brelse(tp, *agi_bpp);
		delay(1);
		goto again;
	}
	if (error)
		return error;

	/* We got the inode, so we can release the AGI. */
	ASSERT(*ipp != NULL);
	xfs_trans_brelse(tp, *agi_bpp);
	*agi_bpp = NULL;
	return 0;
}
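
/*
 * Illustrative sketch only: per the contract above, a caller of
 * xchk_iget_agi() owns the inode on success and owns whatever AGI buffer
 * came back on failure.  The error handling below is hypothetical; the real
 * in-file caller is xchk_iget_for_scrubbing() further down:
 *
 *	error = xchk_iget_agi(sc, ino, &agi_bp, &ip);
 *	if (!error) {
 *		... check the inode ...
 *		xchk_irele(sc, ip);
 *	} else if (agi_bp) {
 *		... single-step the lookup while the AGI is locked ...
 *		xfs_trans_brelse(sc->tp, agi_bp);
 *	}
 */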

#ifdef CONFIG_XFS_QUOTA
/*
 * Try to attach dquots to this inode if we think we might want to repair it.
 * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
 * attached, a quotacheck will be scheduled.
 */
int
xchk_ino_dqattach(
	struct xfs_scrub	*sc)
{
	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	if (!xchk_could_repair(sc))
		return 0;

	return xrep_ino_dqattach(sc);
}
#endif

/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
		xchk_irele(sc, ip);
		return -ENOENT;
	}

	sc->ip = ip;
	return 0;
}

/*
 * Install an already-referenced inode for scrubbing.  Get our own reference to
 * the inode to make disposal simpler.  The inode must not be in I_FREEING or
 * I_WILL_FREE state!
 */
int
xchk_install_live_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (!igrab(VFS_I(ip))) {
		xchk_ino_set_corrupt(sc, ip->i_ino);
		return -EFSCORRUPTED;
	}

	sc->ip = ip;
	return 0;
}

/*
 * In preparation to scrub metadata structures that hang off of an inode,
 * grab either the inode referenced in the scrub control structure or the
 * inode passed in.  If the inumber does not reference an allocated inode
 * record, the function returns ENOENT to end the scrub early.  The inode
 * is not locked.
 */
int
xchk_iget_for_scrubbing(
	struct xfs_scrub	*sc)
{
	struct xfs_imap		imap;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	struct xfs_buf		*agi_bp;
	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
	struct xfs_inode	*ip = NULL;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
	int			error;

	ASSERT(sc->tp == NULL);

	/* We want to scan the inode we already had opened. */
	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
		return xchk_install_live_inode(sc, ip_in);

	/*
	 * On pre-metadir filesystems, reject internal metadata files.  For
	 * metadir filesystems, limited scrubbing of any file in the metadata
	 * directory tree by handle is allowed, because that is the only way to
	 * validate the lack of parent pointers in the sb-root metadata inodes.
	 */
	if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
		return -ENOENT;
	/* Reject obviously bad inode numbers. */
	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
		return -ENOENT;

	/* Try a safe untrusted iget. */
	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
	if (!error)
		return xchk_install_handle_inode(sc, ip);
	if (error == -ENOENT)
		return error;
	if (error != -EINVAL)
		goto out_error;

	/*
	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
	 * userspace gave us an inode number that doesn't correspond to fs
	 * space; the inode btree lacks a record for this inode; or there is a
	 * record, and it says this inode is free.
	 *
	 * We want to look up this inode in the inobt to distinguish two
	 * scenarios: (1) the inobt says the inode is free, in which case
	 * there's nothing to do; and (2) the inobt says the inode is
	 * allocated, but loading it failed due to corruption.
	 *
	 * Allocate a transaction and grab the AGI to prevent inobt activity
	 * in this AG.  Retry the iget in case someone allocated a new inode
	 * after the first iget failed.
	 */
	error = xchk_trans_alloc(sc, 0);
	if (error)
		goto out_error;

	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
	if (error == 0) {
		/* Actually got the inode, so install it. */
		xchk_trans_cancel(sc);
		return xchk_install_handle_inode(sc, ip);
	}
	if (error == -ENOENT)
		goto out_gone;
	if (error != -EINVAL)
		goto out_cancel;

	/* Ensure that we have protected against inode allocation/freeing. */
	if (agi_bp == NULL) {
		ASSERT(agi_bp != NULL);
		error = -ECANCELED;
		goto out_cancel;
	}

	/*
	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
	 * If the inobt says that the inode can't exist inside the filesystem
	 * or isn't allocated, return ENOENT to signal that the check can be
	 * skipped.
	 *
	 * If the lookup returns corruption, we'll mark this inode corrupt and
	 * exit to userspace.  There's little chance of fixing anything until
	 * the inobt is straightened out, but there's nothing we can do here.
	 *
	 * If the lookup encounters any other error, exit to userspace.
	 *
	 * If the lookup succeeds, something else must be very wrong in the fs
	 * such that setting up the incore inode failed in some strange way.
	 * Treat those as corruptions.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
	if (!pag) {
		error = -EFSCORRUPTED;
		goto out_cancel;
	}

	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
			XFS_IGET_UNTRUSTED);
	xfs_perag_put(pag);
	if (error == -EINVAL || error == -ENOENT)
		goto out_gone;
	if (!error)
		error = -EFSCORRUPTED;

out_cancel:
	xchk_trans_cancel(sc);
out_error:
	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
			error, __return_address);
	return error;
out_gone:
	/* The file is gone, so there's nothing to check. */
	xchk_trans_cancel(sc);
	return -ENOENT;
}

/* Release an inode, possibly dropping it in the process. */
void
xchk_irele(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (sc->tp) {
		/*
		 * If we are in a transaction, we /cannot/ drop the inode
		 * ourselves, because the VFS will trigger writeback, which
		 * can require a transaction.  Clear DONTCACHE to force the
		 * inode to the LRU, where someone else can take care of
		 * dropping it.
		 *
		 * Note that when we grabbed our reference to the inode, it
		 * could have had an active ref and DONTCACHE set if a sysadmin
		 * is trying to coerce a change in file access mode.  icache
		 * hits do not clear DONTCACHE, so we must do it here.
		 */
		spin_lock(&VFS_I(ip)->i_lock);
		VFS_I(ip)->i_state &= ~I_DONTCACHE;
		spin_unlock(&VFS_I(ip)->i_lock);
	}

	xfs_irele(ip);
}

/*
 * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
 * this to operate on user-accessible regular file data because the MMAPLOCK is
 * not taken.
 */
int
xchk_setup_inode_contents(
	struct xfs_scrub	*sc,
	unsigned int		resblks)
{
	int			error;

	error = xchk_iget_for_scrubbing(sc);
	if (error)
		return error;

	error = xrep_tempfile_adjust_directory_tree(sc);
	if (error)
		return error;

	/* Lock the inode so the VFS cannot touch this file. */
	xchk_ilock(sc, XFS_IOLOCK_EXCL);

	error = xchk_trans_alloc(sc, resblks);
	if (error)
		goto out;

	error = xchk_ino_dqattach(sc);
	if (error)
		goto out;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
	/* scrub teardown will unlock and release the inode for us */
	return error;
}
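
/*
 * Illustrative sketch only: a scrubber for a fork-mapped structure would
 * normally just forward its setup to xchk_setup_inode_contents() with a
 * block reservation estimate.  The function name below is hypothetical:
 *
 *	int
 *	xchk_setup_example_fork(
 *		struct xfs_scrub	*sc)
 *	{
 *		return xchk_setup_inode_contents(sc, 0);
 *	}
 */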

void
xchk_ilock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	xfs_ilock(sc->ip, ilock_flags);
	sc->ilock_flags |= ilock_flags;
}

bool
xchk_ilock_nowait(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
		sc->ilock_flags |= ilock_flags;
		return true;
	}

	return false;
}

void
xchk_iunlock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	sc->ilock_flags &= ~ilock_flags;
	xfs_iunlock(sc->ip, ilock_flags);
}

/*
 * Predicate that decides if we need to evaluate the cross-reference check.
 * If there was an error accessing the cross-reference btree, just delete
 * the cursor and skip the check.
 */
bool
xchk_should_check_xref(
	struct xfs_scrub	*sc,
	int			*error,
	struct xfs_btree_cur	**curpp)
{
	/* No point in xref if we already know we're corrupt. */
	if (xchk_skip_xref(sc->sm))
		return false;

	if (*error == 0)
		return true;

	if (curpp) {
		/* If we've already given up on xref, just bail out. */
		if (!*curpp)
			return false;

		/* xref error, delete cursor and bail out. */
		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
		*curpp = NULL;
	}

	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
	trace_xchk_xref_error(sc, *error, __return_address);

	/*
	 * Errors encountered during cross-referencing with another
	 * data structure should not cause this scrubber to abort.
	 */
	*error = 0;
	return false;
}
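
/*
 * Illustrative sketch only: a cross-reference helper usually wraps a btree
 * query in xchk_should_check_xref() so that a broken secondary structure
 * downgrades the result to XFAIL instead of aborting the scrub.  The query
 * and locals below are a sketch, not code from this file:
 *
 *	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
 *		return;
 *	if (outcome != XBTREE_RECPACKING_FULL)
 *		xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
 */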

/* Run the structure verifiers on in-memory buffers to detect bad memory. */
void
xchk_buffer_recheck(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;

	if (bp->b_ops == NULL) {
		xchk_block_set_corrupt(sc, bp);
		return;
	}
	if (bp->b_ops->verify_struct == NULL) {
		xchk_set_incomplete(sc);
		return;
	}
	fa = bp->b_ops->verify_struct(bp);
	if (!fa)
		return;
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}

static inline int
xchk_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	struct xfs_scrub_subord	*sub;
	int			error;

	sub = xchk_scrub_create_subord(sc, scrub_type);
	error = sub->sc.ops->scrub(&sub->sc);
	xchk_scrub_free_subord(sub);
	return error;
}

/*
 * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
 * pointed to by sc->ip and the ILOCK must be held.
 */
int
xchk_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			shared;
	int			error;

	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return 0;

	/* Check the inode record. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Metadata inodes don't live on the rt device. */
	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They should never participate in reflink. */
	if (xfs_is_reflink_inode(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* Invoke the data fork scrubber. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Look for incorrect shared blocks. */
	if (xfs_has_reflink(sc->mp)) {
		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
				&shared);
		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
				&error))
			return error;
		if (shared)
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
	}

	/*
	 * Metadata files can only have extended attributes on metadir
	 * filesystems, either for parent pointers or for actual xattr data.
	 */
	if (xfs_inode_hasattr(sc->ip)) {
		if (!xfs_has_metadir(sc->mp)) {
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
			return 0;
		}

		error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
		if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
			return error;
	}

	return 0;
}

/*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation.  Callers must not hold any locks that intersect with the CPU
 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
 * to change kernel code.
 */
void
xchk_fsgates_enable(
	struct xfs_scrub	*sc,
	unsigned int		scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
	ASSERT(!(sc->flags & scrub_fsgates));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_enable();

	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_enable();

	sc->flags |= scrub_fsgates;
}
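
/*
 * Illustrative sketch only: a setup function that knows it will need the
 * intent drain (for example because xchk_perag_drain_and_lock() returned
 * -ECHRNG on a previous pass) would turn the gate on before taking any
 * locks.  The surrounding condition is hypothetical:
 *
 *	if (xchk_could_repair(sc))
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 */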

/*
 * Decide if this is a cached inode that's also allocated.  The caller must
 * hold a reference to an AG and the AGI buffer lock to prevent inodes from
 * being allocated or freed.
 *
 * Look up an inode by number in the given file system.  If the inode number
 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable.  The inode is
 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 */
int
xchk_inode_is_allocated(
	struct xfs_scrub	*sc,
	xfs_agino_t		agino,
	bool			*inuse)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_ino_t		ino;
	struct xfs_inode	*ip;
	int			error;

	/* caller must hold perag reference */
	if (pag == NULL) {
		ASSERT(pag != NULL);
		return -EINVAL;
	}

	/* caller must have AGI buffer */
	if (sc->sa.agi_bp == NULL) {
		ASSERT(sc->sa.agi_bp != NULL);
		return -EINVAL;
	}

	/* reject inode numbers outside existing AGs */
	ino = xfs_agino_to_ino(pag, agino);
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	error = -ENODATA;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		/* cache miss */
		goto out_rcu;
	}

	/*
	 * If the inode number doesn't match, the incore inode got reused
	 * during an RCU grace period and the radix tree hasn't been updated.
	 * This isn't the inode we want.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	trace_xchk_inode_is_allocated(ip);

	/*
	 * We have an incore inode that matches the inode we want, and the
	 * caller holds the perag structure and the AGI buffer.  Let's check
	 * our assumptions below:
	 */

#ifdef DEBUG
	/*
	 * (1) If the incore inode is live (i.e. referenced from the dcache),
	 * it will not be INEW, nor will it be in the inactivation or reclaim
	 * machinery.  The ondisk inode had better be allocated.  This is the
	 * most trivial case.
	 */
	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
			     XFS_INACTIVATING))) {
		/* live inode */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * If the incore inode is INEW, there are several possibilities:
	 *
	 * (2) For a file that is being created, note that we allocate the
	 * ondisk inode before allocating, initializing, and adding the incore
	 * inode to the radix tree.
	 *
	 * (3) If the incore inode is being recycled, the inode has to be
	 * allocated because we don't allow freed inodes to be recycled.
	 * Recycling doesn't touch i_mode.
	 */
	if (ip->i_flags & XFS_INEW) {
		/* created on disk already or recycling */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
	 * inactivation has not started (!INACTIVATING), it is still allocated.
	 */
	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
	    !(ip->i_flags & XFS_INACTIVATING)) {
		/* definitely before difree */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}
#endif

	/*
	 * If the incore inode is undergoing inactivation (INACTIVATING), there
	 * are two possibilities:
	 *
	 * (5) It is before the point where it would get freed ondisk, in which
	 * case i_mode is still nonzero.
	 *
	 * (6) It has already been freed, in which case i_mode is zero.
	 *
	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
	 * and we've taken the AGI buffer lock, which prevents that from
	 * happening.
	 */

	/*
	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
	 * reflects the ondisk state.
	 */

	/*
	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
	 * the flush code uses i_mode to format the ondisk inode.
	 */

	/*
	 * (9) If the inode is in IRECLAIM and was reachable via the radix
	 * tree, it still has the same i_mode as it did before it entered
	 * reclaim.  The inode object is still alive because we hold the RCU
	 * read lock.
	 */

	*inuse = VFS_I(ip)->i_mode != 0;
	error = 0;

out_skip:
	spin_unlock(&ip->i_flags_lock);
out_rcu:
	rcu_read_unlock();
	return error;
}
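
/*
 * Illustrative sketch only: an inobt scrubber holding the AGI could use the
 * helper above to compare the ondisk inode btree's free mask against the
 * incore inode state.  The locals and the corruption call below are
 * hypothetical:
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		return 0;
 *	if (error)
 *		return error;
 *	if (inuse == ondisk_says_free)
 *		xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
 */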
1524 */ 1525 1526 /* 1527 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because 1528 * the flush code uses i_mode to format the ondisk inode. 1529 */ 1530 1531 /* 1532 * (9) If the inode is in IRECLAIM and was reachable via the radix 1533 * tree, it still has the same i_mode as it did before it entered 1534 * reclaim. The inode object is still alive because we hold the RCU 1535 * read lock. 1536 */ 1537 1538 *inuse = VFS_I(ip)->i_mode != 0; 1539 error = 0; 1540 1541 out_skip: 1542 spin_unlock(&ip->i_flags_lock); 1543 out_rcu: 1544 rcu_read_unlock(); 1545 return error; 1546 } 1547 1548 /* Is this inode a root directory for either tree? */ 1549 bool 1550 xchk_inode_is_dirtree_root(const struct xfs_inode *ip) 1551 { 1552 struct xfs_mount *mp = ip->i_mount; 1553 1554 return ip == mp->m_rootip || 1555 (xfs_has_metadir(mp) && ip == mp->m_metadirip); 1556 } 1557 1558 /* Does the superblock point down to this inode? */ 1559 bool 1560 xchk_inode_is_sb_rooted(const struct xfs_inode *ip) 1561 { 1562 return xchk_inode_is_dirtree_root(ip) || 1563 xfs_is_sb_inum(ip->i_mount, ip->i_ino); 1564 } 1565 1566 /* What is the root directory inumber for this inode? */ 1567 xfs_ino_t 1568 xchk_inode_rootdir_inum(const struct xfs_inode *ip) 1569 { 1570 struct xfs_mount *mp = ip->i_mount; 1571 1572 if (xfs_is_metadir_inode(ip)) 1573 return mp->m_metadirip->i_ino; 1574 return mp->m_rootip->i_ino; 1575 } 1576