// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"

/* Common code for the metadata scrubbers. */

/*
 * Handling operational errors.
 *
 * The *_process_error() family of functions are used to process error return
 * codes from functions called as part of a scrub operation.
 *
 * If there's no error, we return true to tell the caller that it's ok
 * to move on to the next check in its list.
 *
 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
 * caller that something bad happened, and we preserve *error so that
 * the caller can return the *error up the stack to userspace.
 *
 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
 * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
 * not via return codes.  We return false to tell the caller that
 * something bad happened.  Since the error has been cleared, the caller
 * will (presumably) return that zero and scrubbing will move on to
 * whatever's next.
 *
 * ftrace can be used to record the precise metadata location and the
 * approximate code location of the failed operation.
 */

/* Check for operational errors. */
static bool
__xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(
				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
				sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		break;
	}
	return false;
}
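
/*
 * Example (illustrative only, not part of the scrub code): a scrubber
 * normally feeds every operational return code through one of the
 * *_process_error() helpers defined below and keeps going only while the
 * helper returns true.  xchk_foo_read_metadata() is a hypothetical
 * stand-in for whatever the check actually does:
 *
 *	error = xchk_foo_read_metadata(sc, agbno);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 *
 * At this point *error is zero; any verifier failure has been folded
 * into sc->sm->sm_flags instead.
 */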

bool
xchk_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_xref_process_error(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	int			*error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/* Check for operational errors for a file offset. */
static bool
__xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		break;
	}
	return false;
}

bool
xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_fblock_xref_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/*
 * Handling scrub corruption/optimization/warning checks.
 *
 * The *_set_{corrupt,preen,warning}() family of functions are used to
 * record the presence of metadata that is incorrect (corrupt), could be
 * optimized somehow (preen), or should be flagged for administrative
 * review but is not incorrect (warn).
 *
 * ftrace can be used to record the precise metadata location and
 * approximate code location of the failed check.
 */

/* Record a block which could be optimized. */
void
xchk_block_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record an inode which could be optimized.  The trace data will include
 * the inode number being scrubbed.
 */
void
xchk_ino_set_preen(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_ino_preen(sc, ino, __return_address);
}

/* Record something being wrong with the filesystem primary superblock. */
void
xchk_set_corrupt(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fs_error(sc, 0, __return_address);
}

/* Record a corrupt block. */
void
xchk_block_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

#ifdef CONFIG_XFS_QUOTA
/* Record a corrupt quota counter. */
void
xchk_qcheck_set_corrupt(
	struct xfs_scrub	*sc,
	unsigned int		dqtype,
	xfs_dqid_t		id)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
}
#endif

/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record a corrupt inode.  The trace data will include the inode number
 * being scrubbed.
 */
void
xchk_ino_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record a corruption while cross-referencing with an inode. */
void
xchk_ino_xref_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record corruption in a block indexed by a file fork. */
void
xchk_fblock_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/* Record a corruption while cross-referencing a fork block. */
void
xchk_fblock_xref_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/*
 * Warn about inodes that need administrative review but are not
 * incorrect.
 */
void
xchk_ino_set_warning(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_ino_warning(sc, ino, __return_address);
}

/* Warn about a block indexed by a file fork that needs review. */
void
xchk_fblock_set_warning(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
}

/* Signal an incomplete scrub. */
void
xchk_set_incomplete(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
	trace_xchk_incomplete(sc, __return_address);
}
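
/*
 * Example (illustrative only): checks record their verdicts through the
 * helpers above rather than through return codes.  A hypothetical inode
 * check might end with something like:
 *
 *	if (record_is_garbage)
 *		xchk_ino_set_corrupt(sc, ip->i_ino);
 *	else if (could_be_tidier)
 *		xchk_ino_set_preen(sc, ip->i_ino);
 *	else if (legal_but_suspicious)
 *		xchk_ino_set_warning(sc, ip->i_ino);
 *
 * where the three predicates stand in for real checks, and a check that
 * cannot finish calls xchk_set_incomplete() instead.
 */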

/*
 * rmap scrubbing -- compute the number of blocks with a given owner,
 * at least according to the reverse mapping data.
 */

struct xchk_rmap_ownedby_info {
	const struct xfs_owner_info	*oinfo;
	xfs_filblks_t			*blocks;
};

STATIC int
xchk_count_rmap_ownedby_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xchk_rmap_ownedby_info	*sroi = priv;
	bool				irec_attr;
	bool				oinfo_attr;

	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;

	if (rec->rm_owner != sroi->oinfo->oi_owner)
		return 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
		(*sroi->blocks) += rec->rm_blockcount;

	return 0;
}

/*
 * Calculate the number of blocks the rmap thinks are owned by something.
 * The caller should pass us an rmapbt cursor.
 */
int
xchk_count_rmap_ownedby_ag(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	const struct xfs_owner_info	*oinfo,
	xfs_filblks_t			*blocks)
{
	struct xchk_rmap_ownedby_info	sroi = {
		.oinfo			= oinfo,
		.blocks			= blocks,
	};

	*blocks = 0;
	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
			&sroi);
}
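
/*
 * Example (illustrative only): a caller that wants to know how many blocks
 * the rmapbt attributes to AG metadata might do something like the
 * following, assuming the AG headers and cursors have already been set up
 * in sc->sa:
 *
 *	xfs_filblks_t		blocks;
 *	int			error;
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 *
 * Any const struct xfs_owner_info works as the owner to count.
 */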

/*
 * AG scrubbing
 *
 * These helpers facilitate locking an allocation group's header
 * buffers, setting up cursors for all btrees that are present, and
 * cleaning everything up once we're through.
 */

/* Decide if we want to return an AG header read failure. */
static inline bool
want_ag_read_header_failure(
	struct xfs_scrub	*sc,
	unsigned int		type)
{
	/* Return all AG header read failures when scanning btrees. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
		return true;
	/*
	 * If we're scanning a given type of AG header, we only want to
	 * see read failures from that specific header.  We'd like the
	 * other headers to cross-check them, but this isn't required.
	 */
	if (sc->sm->sm_type == type)
		return true;
	return false;
}

/*
 * Grab the AG header buffers for the attached perag structure.
 *
 * The headers should be released by xchk_ag_free, but as a fail safe we
 * attach all the buffers we grab to the scrub transaction so they'll all
 * be freed when we cancel it.
 */
static inline int
xchk_perag_read_headers(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	int			error;

	error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
		return error;

	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
		return error;

	return 0;
}

/*
 * Grab the AG headers for the attached perag structure and wait for pending
 * intents to drain.
 */
int
xchk_perag_drain_and_lock(
	struct xfs_scrub	*sc)
{
	struct xchk_ag		*sa = &sc->sa;
	int			error = 0;

	ASSERT(sa->pag != NULL);
	ASSERT(sa->agi_bp == NULL);
	ASSERT(sa->agf_bp == NULL);

	do {
		if (xchk_should_terminate(sc, &error))
			return error;

		error = xchk_perag_read_headers(sc, sa);
		if (error)
			return error;

		/*
		 * If we've grabbed an inode for scrubbing then we assume that
		 * holding its ILOCK will suffice to coordinate with any intent
		 * chains involving this inode.
		 */
		if (sc->ip)
			return 0;

		/*
		 * Decide if this AG is quiet enough for all metadata to be
		 * consistent with each other.  XFS allows the AG header buffer
		 * locks to cycle across transaction rolls while processing
		 * chains of deferred ops, which means that there could be
		 * other threads in the middle of processing a chain of
		 * deferred ops.  For regular operations we are careful about
		 * ordering operations to prevent collisions between threads
		 * (which is why we don't need a per-AG lock), but scrub and
		 * repair have to serialize against chained operations.
		 *
		 * We just locked all the AG header buffers; now take a look
		 * to see if there are any intents in progress.  If there are,
		 * drop the AG headers and wait for the intents to drain.
		 * Since we hold all the AG header locks for the duration of
		 * the scrub, this is the only time we have to sample the
		 * intents counter; any threads increasing it after this point
		 * can't possibly be in the middle of a chain of AG metadata
		 * updates.
		 *
		 * Obviously, this should be slanted against scrub and in favor
		 * of runtime threads.
		 */
		if (!xfs_perag_intent_busy(sa->pag))
			return 0;

		if (sa->agf_bp) {
			xfs_trans_brelse(sc->tp, sa->agf_bp);
			sa->agf_bp = NULL;
		}

		if (sa->agi_bp) {
			xfs_trans_brelse(sc->tp, sa->agi_bp);
			sa->agi_bp = NULL;
		}

		if (!(sc->flags & XCHK_FSGATES_DRAIN))
			return -ECHRNG;
		error = xfs_perag_intent_drain(sa->pag);
		if (error == -ERESTARTSYS)
			error = -EINTR;
	} while (!error);

	return error;
}

/*
 * Grab the per-AG structure, grab all AG header buffers, and wait until there
 * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 * structure.
 */
int
xchk_ag_read_headers(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	ASSERT(!sa->pag);
	sa->pag = xfs_perag_get(mp, agno);
	if (!sa->pag)
		return -ENOENT;

	return xchk_perag_drain_and_lock(sc);
}

/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
	struct xchk_ag		*sa)
{
	if (sa->refc_cur)
		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
	if (sa->rmap_cur)
		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
	if (sa->fino_cur)
		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
	if (sa->ino_cur)
		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
	if (sa->cnt_cur)
		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
	if (sa->bno_cur)
		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);

	sa->refc_cur = NULL;
	sa->rmap_cur = NULL;
	sa->fino_cur = NULL;
	sa->ino_cur = NULL;
	sa->bno_cur = NULL;
	sa->cnt_cur = NULL;
}

/* Initialize all the btree cursors for an AG. */
void
xchk_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	if (sa->agf_bp) {
		/* Set up a bnobt cursor for cross-referencing. */
		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
				XFS_SCRUB_TYPE_BNOBT);

		/* Set up a cntbt cursor for cross-referencing. */
		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
				XFS_SCRUB_TYPE_CNTBT);

		/* Set up a rmapbt cursor for cross-referencing. */
		if (xfs_has_rmapbt(mp)) {
			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
					XFS_SCRUB_TYPE_RMAPBT);
		}

		/* Set up a refcountbt cursor for cross-referencing. */
		if (xfs_has_reflink(mp)) {
			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
					XFS_SCRUB_TYPE_REFCNTBT);
		}
	}

	if (sa->agi_bp) {
		/* Set up an inobt cursor for cross-referencing. */
		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
				sa->agi_bp);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
				XFS_SCRUB_TYPE_INOBT);

		/* Set up a finobt cursor for cross-referencing. */
		if (xfs_has_finobt(mp)) {
			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
					sa->agi_bp);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
					XFS_SCRUB_TYPE_FINOBT);
		}
	}
}

/* Release the AG header context and btree cursors. */
void
xchk_ag_free(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	xchk_ag_btcur_free(sa);
	xrep_reset_perag_resv(sc);
	if (sa->agf_bp) {
		xfs_trans_brelse(sc->tp, sa->agf_bp);
		sa->agf_bp = NULL;
	}
	if (sa->agi_bp) {
		xfs_trans_brelse(sc->tp, sa->agi_bp);
		sa->agi_bp = NULL;
	}
	if (sa->pag) {
		xfs_perag_put(sa->pag);
		sa->pag = NULL;
	}
}

/*
 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 * order.  Locking order requires us to get the AGI before the AGF.  We use the
 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 * caller passes one in (bmap scrub) or we have to create a transaction
 * ourselves.  Returns -ENOENT if the perag struct cannot be grabbed.
 */
int
xchk_ag_init(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	int			error;

	error = xchk_ag_read_headers(sc, agno, sa);
	if (error)
		return error;

	xchk_ag_btcur_init(sc, sa);
	return 0;
}
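
/*
 * Example (illustrative only): per-AG checks usually bracket their work
 * with the helpers above, roughly:
 *
 *	error = xchk_ag_init(sc, agno, &sc->sa);
 *	if (error)
 *		return error;
 *	(examine metadata against sc->sa.bno_cur, sc->sa.rmap_cur, ...)
 *	xchk_ag_free(sc, &sc->sa);
 *
 * In practice most checks only do the init half (usually from their setup
 * function) and rely on the scrub core teardown to release everything.
 */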

/* Per-scrubber setup functions */

void
xchk_trans_cancel(
	struct xfs_scrub	*sc)
{
	xfs_trans_cancel(sc->tp);
	sc->tp = NULL;
}

int
xchk_trans_alloc_empty(
	struct xfs_scrub	*sc)
{
	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}

/*
 * Grab an empty transaction so that we can re-grab locked buffers if
 * one of our btrees turns out to be cyclic.
 *
 * If we're going to repair something, we need to ask for the largest possible
 * log reservation so that we can handle the worst case scenario for metadata
 * updates while rebuilding a metadata item.  We also need to reserve as many
 * blocks in the head transaction as we think we're going to need to rebuild
 * the metadata object.
 */
int
xchk_trans_alloc(
	struct xfs_scrub	*sc,
	uint			resblks)
{
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
				resblks, 0, 0, &sc->tp);

	return xchk_trans_alloc_empty(sc);
}

/* Set us up with a transaction and an empty context. */
int
xchk_setup_fs(
	struct xfs_scrub	*sc)
{
	uint			resblks;

	resblks = xrep_calc_ag_resblks(sc);
	return xchk_trans_alloc(sc, resblks);
}

/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
	struct xfs_scrub	*sc,
	bool			force_log)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	/*
	 * If the caller asks us to checkpoint the log, do so.  This
	 * expensive operation should be performed infrequently and only
	 * as a last resort.  Any caller that sets force_log should
	 * document why they need to do so.
	 */
	if (force_log) {
		error = xchk_checkpoint_log(mp);
		if (error)
			return error;
	}

	error = xchk_setup_fs(sc);
	if (error)
		return error;

	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
}

/* Push everything out of the log onto disk. */
int
xchk_checkpoint_log(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all_sync(mp->m_ail);
	return 0;
}
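
/*
 * Example (illustrative only): most AG btree scrubbers implement their
 * setup hook as a thin wrapper around xchk_setup_ag_btree(), e.g. a
 * hypothetical:
 *
 *	int
 *	xchk_setup_foo_btree(
 *		struct xfs_scrub	*sc)
 *	{
 *		return xchk_setup_ag_btree(sc, false);
 *	}
 *
 * passing true for force_log only when the check genuinely needs the log
 * checkpointed first, for the reasons documented above.
 */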

/* Verify that an inode is allocated ondisk, then return its cached inode. */
int
xchk_iget(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_inode	**ipp)
{
	ASSERT(sc->tp != NULL);

	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}

/*
 * Try to grab an inode in a manner that avoids races with physical inode
 * allocation.  If we can't, return the locked AGI buffer so that the caller
 * can single-step the loading process to see where things went wrong.
 * Callers must have a valid scrub transaction.
 *
 * If the iget succeeds, return 0, a NULL AGI, and the inode.
 *
 * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 * no longer allocated; or any other corruption or runtime error.
 *
 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 *
 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 */
int
xchk_iget_agi(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_buf		**agi_bpp,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = sc->tp;
	struct xfs_perag	*pag;
	int			error;

	ASSERT(sc->tp != NULL);

again:
	*agi_bpp = NULL;
	*ipp = NULL;
	error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
	 * in the iget cache miss path.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
	xfs_perag_put(pag);
	if (error)
		return error;

	error = xfs_iget(mp, tp, inum,
			XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
	if (error == -EAGAIN) {
		/*
		 * The inode may be in core but temporarily unavailable and may
		 * require the AGI buffer before it can be returned.  Drop the
		 * AGI buffer and retry the lookup.
		 *
		 * Incore lookup will fail with EAGAIN on a cache hit if the
		 * inode is queued to the inactivation list.  The inactivation
		 * worker may remove the inode from the unlinked list and hence
		 * needs the AGI.
		 *
		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
		 * to allow inodegc to make progress and move the inode to
		 * IRECLAIMABLE state where xfs_iget will be able to return it
		 * again if it can lock the inode.
		 */
		xfs_trans_brelse(tp, *agi_bpp);
		delay(1);
		goto again;
	}
	if (error)
		return error;

	/* We got the inode, so we can release the AGI. */
	ASSERT(*ipp != NULL);
	xfs_trans_brelse(tp, *agi_bpp);
	*agi_bpp = NULL;
	return 0;
}
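
/*
 * Example (illustrative only): a caller of xchk_iget_agi() has to handle
 * every outcome documented above, roughly:
 *
 *	error = xchk_iget_agi(sc, ino, &agi_bp, &ip);
 *	if (error == 0) {
 *		(agi_bp is NULL; scrub ip, then xchk_irele(sc, ip))
 *	} else if (agi_bp != NULL) {
 *		(iget failed; inspect the inobt while the AGI stays locked)
 *	} else {
 *		(AGI read error or fatal signal; just return error)
 *	}
 *
 * xchk_iget_for_scrubbing() below is the real consumer of this pattern.
 */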

#ifdef CONFIG_XFS_QUOTA
/*
 * Try to attach dquots to this inode if we think we might want to repair it.
 * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
 * attached, a quotacheck will be scheduled.
 */
int
xchk_ino_dqattach(
	struct xfs_scrub	*sc)
{
	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	if (!xchk_could_repair(sc))
		return 0;

	return xrep_ino_dqattach(sc);
}
#endif

/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
		xchk_irele(sc, ip);
		return -ENOENT;
	}

	sc->ip = ip;
	return 0;
}

/*
 * Install an already-referenced inode for scrubbing.  Get our own reference
 * to the inode to make disposal simpler.  The inode must not be in I_FREEING
 * or I_WILL_FREE state!
 */
int
xchk_install_live_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (!igrab(VFS_I(ip))) {
		xchk_ino_set_corrupt(sc, ip->i_ino);
		return -EFSCORRUPTED;
	}

	sc->ip = ip;
	return 0;
}

/*
 * In preparation to scrub metadata structures that hang off of an inode,
 * grab either the inode referenced in the scrub control structure or the
 * inode passed in.  If the inumber does not reference an allocated inode
 * record, the function returns ENOENT to end the scrub early.  The inode
 * is not locked.
 */
int
xchk_iget_for_scrubbing(
	struct xfs_scrub	*sc)
{
	struct xfs_imap		imap;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	struct xfs_buf		*agi_bp;
	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
	struct xfs_inode	*ip = NULL;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
	int			error;

	ASSERT(sc->tp == NULL);

	/* We want to scan the inode we already had opened. */
	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
		return xchk_install_live_inode(sc, ip_in);

	/* Reject internal metadata files and obviously bad inode numbers. */
	if (xfs_internal_inum(mp, sc->sm->sm_ino))
		return -ENOENT;
	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
		return -ENOENT;

	/* Try a safe untrusted iget. */
	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
	if (!error)
		return xchk_install_handle_inode(sc, ip);
	if (error == -ENOENT)
		return error;
	if (error != -EINVAL)
		goto out_error;

	/*
	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
	 * userspace gave us an inode number that doesn't correspond to fs
	 * space; the inode btree lacks a record for this inode; or there is a
	 * record, and it says this inode is free.
	 *
	 * We want to look up this inode in the inobt to distinguish two
	 * scenarios: (1) the inobt says the inode is free, in which case
	 * there's nothing to do; and (2) the inobt says the inode is
	 * allocated, but loading it failed due to corruption.
	 *
	 * Allocate a transaction and grab the AGI to prevent inobt activity
	 * in this AG.  Retry the iget in case someone allocated a new inode
	 * after the first iget failed.
	 */
	error = xchk_trans_alloc(sc, 0);
	if (error)
		goto out_error;

	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
	if (error == 0) {
		/* Actually got the inode, so install it. */
		xchk_trans_cancel(sc);
		return xchk_install_handle_inode(sc, ip);
	}
	if (error == -ENOENT)
		goto out_gone;
	if (error != -EINVAL)
		goto out_cancel;

	/* Ensure that we have protected against inode allocation/freeing. */
	if (agi_bp == NULL) {
		ASSERT(agi_bp != NULL);
		error = -ECANCELED;
		goto out_cancel;
	}

	/*
	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
	 * If the inobt says the inode cannot exist inside the filesystem or
	 * is not allocated, return ENOENT to signal that the check can be
	 * skipped.
	 *
	 * If the lookup returns corruption, we'll mark this inode corrupt and
	 * exit to userspace.  There's little chance of fixing anything until
	 * the inobt is straightened out, but there's nothing we can do here.
	 *
	 * If the lookup encounters any other error, exit to userspace.
	 *
	 * If the lookup succeeds, something else must be very wrong in the fs
	 * such that setting up the incore inode failed in some strange way.
	 * Treat those as corruptions.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
	if (!pag) {
		error = -EFSCORRUPTED;
		goto out_cancel;
	}

	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
			XFS_IGET_UNTRUSTED);
	xfs_perag_put(pag);
	if (error == -EINVAL || error == -ENOENT)
		goto out_gone;
	if (!error)
		error = -EFSCORRUPTED;

out_cancel:
	xchk_trans_cancel(sc);
out_error:
	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
			error, __return_address);
	return error;
out_gone:
	/* The file is gone, so there's nothing to check. */
	xchk_trans_cancel(sc);
	return -ENOENT;
}

/* Release an inode, possibly dropping it in the process. */
void
xchk_irele(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (sc->tp) {
		/*
		 * If we are in a transaction, we /cannot/ drop the inode
		 * ourselves, because the VFS will trigger writeback, which
		 * can require a transaction.  Clear DONTCACHE to force the
		 * inode to the LRU, where someone else can take care of
		 * dropping it.
		 *
		 * Note that when we grabbed our reference to the inode, it
		 * could have had an active ref and DONTCACHE set if a sysadmin
		 * is trying to coerce a change in file access mode.  icache
		 * hits do not clear DONTCACHE, so we must do it here.
		 */
		spin_lock(&VFS_I(ip)->i_lock);
		VFS_I(ip)->i_state &= ~I_DONTCACHE;
		spin_unlock(&VFS_I(ip)->i_lock);
	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
		/*
		 * If this is the last reference to the inode and the caller
		 * permits it, set DONTCACHE to avoid thrashing.
		 */
		d_mark_dontcache(VFS_I(ip));
	}

	xfs_irele(ip);
}

/*
 * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
 * this to operate on user-accessible regular file data because the MMAPLOCK is
 * not taken.
 */
int
xchk_setup_inode_contents(
	struct xfs_scrub	*sc,
	unsigned int		resblks)
{
	int			error;

	error = xchk_iget_for_scrubbing(sc);
	if (error)
		return error;

	/* Lock the inode so the VFS cannot touch this file. */
	xchk_ilock(sc, XFS_IOLOCK_EXCL);

	error = xchk_trans_alloc(sc, resblks);
	if (error)
		goto out;

	error = xchk_ino_dqattach(sc);
	if (error)
		goto out;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
	/* scrub teardown will unlock and release the inode for us */
	return error;
}

void
xchk_ilock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	xfs_ilock(sc->ip, ilock_flags);
	sc->ilock_flags |= ilock_flags;
}

bool
xchk_ilock_nowait(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
		sc->ilock_flags |= ilock_flags;
		return true;
	}

	return false;
}

void
xchk_iunlock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	sc->ilock_flags &= ~ilock_flags;
	xfs_iunlock(sc->ip, ilock_flags);
}

/*
 * Predicate that decides if we need to evaluate the cross-reference check.
 * If there was an error accessing the cross-reference btree, just delete
 * the cursor and skip the check.
 */
bool
xchk_should_check_xref(
	struct xfs_scrub	*sc,
	int			*error,
	struct xfs_btree_cur	**curpp)
{
	/* No point in xref if we already know we're corrupt. */
	if (xchk_skip_xref(sc->sm))
		return false;

	if (*error == 0)
		return true;

	if (curpp) {
		/* If we've already given up on xref, just bail out. */
		if (!*curpp)
			return false;

		/* xref error, delete cursor and bail out. */
		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
		*curpp = NULL;
	}

	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
	trace_xchk_xref_error(sc, *error, __return_address);

	/*
	 * Errors encountered during cross-referencing with another
	 * data structure should not cause this scrubber to abort.
	 */
	*error = 0;
	return false;
}
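
/*
 * Example (illustrative only): cross-reference helpers typically wrap
 * their btree queries with the predicate above.  xchk_foo_query_rmapbt()
 * is a hypothetical query:
 *
 *	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
 *		return;
 *	error = xchk_foo_query_rmapbt(sc, &has_mapping);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 *	if (!has_mapping)
 *		xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
 *
 * Errors in the xref structure mark the check XFAIL instead of failing
 * the whole scrub.
 */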

/* Run the structure verifiers on in-memory buffers to detect bad memory. */
void
xchk_buffer_recheck(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;

	if (bp->b_ops == NULL) {
		xchk_block_set_corrupt(sc, bp);
		return;
	}
	if (bp->b_ops->verify_struct == NULL) {
		xchk_set_incomplete(sc);
		return;
	}
	fa = bp->b_ops->verify_struct(bp);
	if (!fa)
		return;
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}

static inline int
xchk_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	struct xfs_scrub_subord	*sub;
	int			error;

	sub = xchk_scrub_create_subord(sc, scrub_type);
	error = sub->sc.ops->scrub(&sub->sc);
	xchk_scrub_free_subord(sub);
	return error;
}

/*
 * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
 * pointed to by sc->ip and the ILOCK must be held.
 */
int
xchk_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			shared;
	int			error;

	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return 0;

	/* Check the inode record. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Metadata inodes don't live on the rt device. */
	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They should never participate in reflink. */
	if (xfs_is_reflink_inode(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They also should never have extended attributes. */
	if (xfs_inode_hasattr(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* Invoke the data fork scrubber. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Look for incorrect shared blocks. */
	if (xfs_has_reflink(sc->mp)) {
		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
				&shared);
		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
				&error))
			return error;
		if (shared)
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
	}

	return 0;
}

/*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation.  Callers must not hold any locks that intersect with the CPU
 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
 * to change kernel code.
 */
void
xchk_fsgates_enable(
	struct xfs_scrub	*sc,
	unsigned int		scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
	ASSERT(!(sc->flags & scrub_fsgates));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_enable();

	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_enable();

	sc->flags |= scrub_fsgates;
}
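
/*
 * Example (illustrative only): the drain gate ties back to
 * xchk_perag_drain_and_lock() above, which returns -ECHRNG when the gate
 * is not open.  A setup path that may need to wait out intent chains can
 * therefore do something like:
 *
 *	if (sc->flags & XCHK_TRY_HARDER)
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 *
 * so the hooks are only paid for on the "try harder" retry of the scrub.
 */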

/*
 * Decide if this is a cached inode that's also allocated.  The caller must
 * hold a reference to an AG and the AGI buffer lock to prevent inodes from
 * being allocated or freed.
 *
 * Look up an inode by number in the given file system.  If the inode number
 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable.  The inode is
 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 */
int
xchk_inode_is_allocated(
	struct xfs_scrub	*sc,
	xfs_agino_t		agino,
	bool			*inuse)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_ino_t		ino;
	struct xfs_inode	*ip;
	int			error;

	/* caller must hold perag reference */
	if (pag == NULL) {
		ASSERT(pag != NULL);
		return -EINVAL;
	}

	/* caller must have AGI buffer */
	if (sc->sa.agi_bp == NULL) {
		ASSERT(sc->sa.agi_bp != NULL);
		return -EINVAL;
	}

	/* reject inode numbers outside existing AGs */
	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	error = -ENODATA;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		/* cache miss */
		goto out_rcu;
	}

	/*
	 * If the inode number doesn't match, the incore inode got reused
	 * during an RCU grace period and the radix tree hasn't been updated.
	 * This isn't the inode we want.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	trace_xchk_inode_is_allocated(ip);

	/*
	 * We have an incore inode that matches the inode we want, and the
	 * caller holds the perag structure and the AGI buffer.  Let's check
	 * our assumptions below:
	 */

#ifdef DEBUG
	/*
	 * (1) If the incore inode is live (i.e. referenced from the dcache),
	 * it will not be INEW, nor will it be in the inactivation or reclaim
	 * machinery.  The ondisk inode had better be allocated.  This is the
	 * most trivial case.
	 */
	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
			     XFS_INACTIVATING))) {
		/* live inode */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * If the incore inode is INEW, there are several possibilities:
	 *
	 * (2) For a file that is being created, note that we allocate the
	 * ondisk inode before allocating, initializing, and adding the incore
	 * inode to the radix tree.
	 *
	 * (3) If the incore inode is being recycled, the inode has to be
	 * allocated because we don't allow freed inodes to be recycled.
	 * Recycling doesn't touch i_mode.
	 */
	if (ip->i_flags & XFS_INEW) {
		/* created on disk already or recycling */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
	 * inactivation has not started (!INACTIVATING), it is still allocated.
	 */
	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
	    !(ip->i_flags & XFS_INACTIVATING)) {
		/* definitely before difree */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}
#endif

	/*
	 * If the incore inode is undergoing inactivation (INACTIVATING), there
	 * are two possibilities:
	 *
	 * (5) It is before the point where it would get freed ondisk, in which
	 * case i_mode is still nonzero.
	 *
	 * (6) It has already been freed, in which case i_mode is zero.
	 *
	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
	 * and we've taken the AGI buffer lock, which prevents that from
	 * happening.
	 */

	/*
	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
	 * reflects the ondisk state.
1430 */ 1431 1432 /* 1433 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because 1434 * the flush code uses i_mode to format the ondisk inode. 1435 */ 1436 1437 /* 1438 * (9) If the inode is in IRECLAIM and was reachable via the radix 1439 * tree, it still has the same i_mode as it did before it entered 1440 * reclaim. The inode object is still alive because we hold the RCU 1441 * read lock. 1442 */ 1443 1444 *inuse = VFS_I(ip)->i_mode != 0; 1445 error = 0; 1446 1447 out_skip: 1448 spin_unlock(&ip->i_flags_lock); 1449 out_rcu: 1450 rcu_read_unlock(); 1451 return error; 1452 } 1453