1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_btree_staging.h" 15 #include "xfs_buf_mem.h" 16 #include "xfs_btree_mem.h" 17 #include "xfs_bit.h" 18 #include "xfs_log_format.h" 19 #include "xfs_trans.h" 20 #include "xfs_sb.h" 21 #include "xfs_alloc.h" 22 #include "xfs_alloc_btree.h" 23 #include "xfs_ialloc.h" 24 #include "xfs_ialloc_btree.h" 25 #include "xfs_rmap.h" 26 #include "xfs_rmap_btree.h" 27 #include "xfs_inode.h" 28 #include "xfs_icache.h" 29 #include "xfs_bmap.h" 30 #include "xfs_bmap_btree.h" 31 #include "xfs_refcount.h" 32 #include "xfs_refcount_btree.h" 33 #include "xfs_ag.h" 34 #include "xfs_rtrmap_btree.h" 35 #include "xfs_rtgroup.h" 36 #include "xfs_rtrefcount_btree.h" 37 #include "scrub/xfs_scrub.h" 38 #include "scrub/scrub.h" 39 #include "scrub/common.h" 40 #include "scrub/btree.h" 41 #include "scrub/trace.h" 42 #include "scrub/repair.h" 43 #include "scrub/bitmap.h" 44 #include "scrub/agb_bitmap.h" 45 #include "scrub/xfile.h" 46 #include "scrub/xfarray.h" 47 #include "scrub/iscan.h" 48 #include "scrub/newbt.h" 49 #include "scrub/reap.h" 50 51 /* 52 * Reverse Mapping Btree Repair 53 * ============================ 54 * 55 * This is the most involved of all the AG space btree rebuilds. Everywhere 56 * else in XFS we lock inodes and then AG data structures, but generating the 57 * list of rmap records requires that we be able to scan both block mapping 58 * btrees of every inode in the filesystem to see if it owns any extents in 59 * this AG. 
We can't tolerate any inode updates while we do this, so we 60 * freeze the filesystem to lock everyone else out, and grant ourselves 61 * special privileges to run transactions with regular background reclamation 62 * turned off. 63 * 64 * We also have to be very careful not to allow inode reclaim to start a 65 * transaction because all transactions (other than our own) will block. 66 * Deferred inode inactivation helps us out there. 67 * 68 * I) Reverse mappings for all non-space metadata and file data are collected 69 * according to the following algorithm: 70 * 71 * 1. For each fork of each inode: 72 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. 73 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate 74 * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree 75 * block. 76 * 1.3. If the incore extent map is loaded but the fork is in btree format, 77 * just visit the bmbt blocks to set the corresponding BMBIT areas. 78 * 1.4. From the incore extent map, accumulate each bmap that falls into our 79 * target AG. Remember, multiple bmap records can map to a single rmap 80 * record, so we cannot simply emit rmap records 1:1. 81 * 1.5. Emit rmap records for each extent in BMBIT and free it. 82 * 2. Create bitmaps INOBIT and ICHUNKBIT. 83 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, 84 * and set bits in INOBIT for each btree block. If the inobt has no records 85 * at all, we must be careful to record its root in INOBIT. 86 * 4. For each block in the finobt, set the corresponding INOBIT area. 87 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. 88 * 6. Create bitmaps REFCBIT and COWBIT. 89 * 7. For each CoW staging extent in the refcountbt, set the corresponding 90 * areas in COWBIT. 91 * 8. For each block in the refcountbt, set the corresponding REFCBIT area. 92 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. 93 * A. 
Emit rmap for the AG headers. 94 * B. Emit rmap for the log, if there is one. 95 * 96 * II) The rmapbt shape and space metadata rmaps are computed as follows: 97 * 98 * 1. Count the rmaps collected in the previous step. (= NR) 99 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) 100 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. 101 * 4. Create bitmap AGBIT. 102 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. 103 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. 104 * 7. Count the extents in AGBIT. (= AGNR) 105 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') 106 * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', 107 * and clear AGBIT. Go to step 5. 108 * A. Emit rmaps for each extent in AGBIT. 109 * 110 * III) The rmapbt is constructed and set in place as follows: 111 * 112 * 1. Sort the rmap records. 113 * 2. Bulk load the rmaps. 114 * 115 * IV) Reap the old btree blocks. 116 * 117 * 1. Create a bitmap OLDRMBIT. 118 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. 119 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. 120 * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are 121 * the parts of the AG that the rmap didn't find during its scan of the 122 * primary metadata and aren't known to be in the free space, which implies 123 * that they were the old rmapbt blocks. 124 * 5. Commit. 125 * 126 * We use the 'xrep_rmap' prefix for all the rmap functions. 
 */

/* Context for collecting rmaps */
struct xrep_rmap {
	/* new rmapbt information */
	struct xrep_newbt	new_btree;

	/* lock for the xfbtree and xfile */
	struct mutex		lock;

	/* rmap records generated from primary metadata */
	struct xfbtree		rmap_btree;

	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/* inode scan cursor */
	struct xchk_iscan	iscan;

	/* Number of non-freespace records found. */
	unsigned long long	nr_records;

	/* bnobt/cntbt contribution to btreeblks */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter */
	unsigned int		old_rmapbt_fsbcount;
};

/*
 * Set us up to repair reverse mapping btrees.
 *
 * Allocates the xrep_rmap context (stashed in sc->buf) and the in-memory
 * btree backing store used to stage rmap records during the scan.
 */
int
xrep_setup_ag_rmapbt(
	struct xfs_scrub	*sc)
{
	struct xrep_rmap	*rr;
	int			error;

	/*
	 * Enable the rmap fsgate; presumably this routes live rmap updates
	 * to the hooks in rr->rhook while the scan runs.  NOTE(review):
	 * confirm against the xchk_fsgates_enable implementation.
	 */
	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);

	/* Create the in-memory btree file that will hold staged records. */
	error = xrep_setup_xfbtree(sc, "reverse mapping records");
	if (error)
		return error;

	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;

	rr->sc = sc;
	sc->buf = rr;
	return 0;
}

/*
 * Make sure there's nothing funny about this mapping.
 *
 * Returns -EFSCORRUPTED if the record fails the generic rmap record checks
 * or if it overlaps space that the bnobt says is free.
 */
STATIC int
xrep_rmap_check_mapping(
	struct xfs_scrub		*sc,
	const struct xfs_rmap_irec	*rec)
{
	enum xbtree_recpacking		outcome;
	int				error;

	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
			rec->rm_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/*
 * Store a reverse-mapping record in the in-memory btree.
 *
 * The record is inserted under rr->lock; on any failure the in-memory btree
 * update is backed out (or, after a failed commit, the scan is aborted so
 * that nobody trusts the incomplete record set).
 */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	struct xfs_rmap_irec	rmap = {
		.rm_startblock	= startblock,
		.rm_blockcount	= blockcount,
		.rm_owner	= owner,
		.rm_offset	= offset,
		.rm_flags	= flags,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*mcur;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/* An aborted iscan means the record set is already incomplete. */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	trace_xrep_rmap_found(sc->sa.pag, &rmap);

	/* Serialize in-memory btree access against concurrent updaters. */
	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
	error = xfs_rmap_map_raw(mcur, &rmap);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
	if (error)
		goto out_abort;

	mutex_unlock(&rr->lock);
	return 0;

out_cancel:
	/* Back out the uncommitted in-memory btree changes... */
	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
out_abort:
	/* ...and mark the scan dead so the repair bails out. */
	xchk_iscan_abort(&rr->iscan);
	mutex_unlock(&rr->lock);
	return error;
}

/* Arguments for the bitmap -> stash walk below. */
struct xrep_rmap_stash_run {
	struct xrep_rmap	*rr;
	uint64_t		owner;
	unsigned int		rmap_flags;
};

/* Bitmap walk callback: stash one [start, start+len) run as a single rmap. */
static int
xrep_rmap_stash_run(
	uint32_t			start,
	uint32_t			len,
	void				*priv)
{
	struct xrep_rmap_stash_run	*rsr = priv;
	struct xrep_rmap		*rr = rsr->rr;

	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
}

/*
 * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
 * that the ranges are in units of FS blocks.
 */
STATIC int
xrep_rmap_stash_bitmap(
	struct xrep_rmap		*rr,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xrep_rmap_stash_run	rsr = {
		.rr			= rr,
		.owner			= oinfo->oi_owner,
		.rmap_flags		= 0,
	};

	/* Translate the owner-info flags into rmap record flags. */
	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;

	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
}

/* Section (I): Finding all file and bmbt extents. */

/* Context for accumulating rmaps for an inode fork. */
struct xrep_rmap_ifork {
	/*
	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
	 * single rmap.
	 */
	struct xfs_rmap_irec	accum;

	/* Bitmap of bmbt blocks in this AG. */
	struct xagb_bitmap	bmbt_blocks;

	struct xrep_rmap	*rr;

	/* Which inode fork? */
	int			whichfork;
};

/*
 * Stash an rmap that we accumulated while walking an inode fork.  A zero
 * blockcount means nothing has been accumulated yet; that is not an error.
 */
STATIC int
xrep_rmap_stash_accumulated(
	struct xrep_rmap_ifork	*rf)
{
	if (rf->accum.rm_blockcount == 0)
		return 0;

	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
			rf->accum.rm_blockcount, rf->accum.rm_owner,
			rf->accum.rm_offset, rf->accum.rm_flags);
}

/* Accumulate a bmbt record.
*/ 333 STATIC int 334 xrep_rmap_visit_bmbt( 335 struct xfs_btree_cur *cur, 336 struct xfs_bmbt_irec *rec, 337 void *priv) 338 { 339 struct xrep_rmap_ifork *rf = priv; 340 struct xfs_mount *mp = rf->rr->sc->mp; 341 struct xfs_rmap_irec *accum = &rf->accum; 342 xfs_agblock_t agbno; 343 unsigned int rmap_flags = 0; 344 int error; 345 346 if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != 347 pag_agno(rf->rr->sc->sa.pag)) 348 return 0; 349 350 agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); 351 if (rf->whichfork == XFS_ATTR_FORK) 352 rmap_flags |= XFS_RMAP_ATTR_FORK; 353 if (rec->br_state == XFS_EXT_UNWRITTEN) 354 rmap_flags |= XFS_RMAP_UNWRITTEN; 355 356 /* If this bmap is adjacent to the previous one, just add it. */ 357 if (accum->rm_blockcount > 0 && 358 rec->br_startoff == accum->rm_offset + accum->rm_blockcount && 359 agbno == accum->rm_startblock + accum->rm_blockcount && 360 rmap_flags == accum->rm_flags) { 361 accum->rm_blockcount += rec->br_blockcount; 362 return 0; 363 } 364 365 /* Otherwise stash the old rmap and start accumulating a new one. */ 366 error = xrep_rmap_stash_accumulated(rf); 367 if (error) 368 return error; 369 370 accum->rm_startblock = agbno; 371 accum->rm_blockcount = rec->br_blockcount; 372 accum->rm_offset = rec->br_startoff; 373 accum->rm_flags = rmap_flags; 374 return 0; 375 } 376 377 /* Add a btree block to the bitmap. 
 */
STATIC int
xrep_rmap_visit_iroot_btree_block(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*priv)
{
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;
	xfs_agblock_t		agbno;

	/*
	 * No buffer means this is a root block in the inode fork, which has
	 * no separate disk block to record.
	 */
	xfs_btree_get_block(cur, level, &bp);
	if (!bp)
		return 0;

	/* Only record blocks that live in the AG we're repairing. */
	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
		return 0;

	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
}

/*
 * Iterate a metadata btree rooted in an inode to collect rmap records for
 * anything in this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iroot_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_btree_cur	*cur)
{
	struct xfs_owner_info	oinfo;
	struct xrep_rmap	*rr = rf->rr;
	int			error;

	xagb_bitmap_init(&rf->bmbt_blocks);

	/* Record all the blocks in the btree itself. */
	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
			XFS_BTREE_VISIT_ALL, rf);
	if (error)
		goto out;

	/* Emit rmaps for the btree blocks, owned by this inode's bmbt. */
	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
	if (error)
		goto out;

	/* Stash any remaining accumulated rmaps. */
	error = xrep_rmap_stash_accumulated(rf);
out:
	xagb_bitmap_destroy(&rf->bmbt_blocks);
	return error;
}

/*
 * Iterate the block mapping btree to collect rmap records for anything in this
 * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
 * block mappings in this fork.
 */
STATIC int
xrep_rmap_scan_bmbt(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip,
	bool			*mappings_done)
{
	struct xrep_rmap	*rr = rf->rr;
	struct xfs_btree_cur	*cur;
	struct xfs_ifork	*ifp;
	int			error;

	*mappings_done = false;
	ifp = xfs_ifork_ptr(ip, rf->whichfork);
	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);

	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
	    xfs_need_iread_extents(ifp)) {
		/*
		 * If the incore extent cache isn't loaded, scan the bmbt for
		 * mapping records.  This avoids loading the incore extent
		 * tree, which will increase memory pressure at a time when
		 * we're trying to run as quickly as we possibly can.  Ignore
		 * realtime extents.
		 */
		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
		if (error)
			goto out_cur;

		/* Tell the caller not to rescan via the incore cache. */
		*mappings_done = true;
	}

	/* Scan for the bmbt blocks, which always live on the data device. */
	error = xrep_rmap_scan_iroot_btree(rf, cur);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Iterate the in-core extent cache to collect rmap records for anything in
 * this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iext(
	struct xrep_rmap_ifork	*rf,
	struct xfs_ifork	*ifp)
{
	struct xfs_bmbt_irec	rec;
	struct xfs_iext_cursor	icur;
	int			error;

	for_each_xfs_iext(ifp, &icur, &rec) {
		/* Delayed-allocation extents have no disk blocks to map. */
		if (isnullstartblock(rec.br_startblock))
			continue;
		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
		if (error)
			return error;
	}

	/* Flush whatever is left in the accumulator. */
	return xrep_rmap_stash_accumulated(rf);
}

/*
 * Collect rmaps for the data-device blocks of a realtime metadata btree
 * (rtrmap/rtrefcount) inode.  Such an inode must belong to some rtgroup;
 * an orphaned one may not own any ondisk blocks.
 */
static int
xrep_rmap_scan_meta_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip)
{
	struct xfs_scrub	*sc = rf->rr->sc;
	struct xfs_rtgroup	*rtg = NULL;
	struct xfs_btree_cur	*cur = NULL;
	enum xfs_rtg_inodes	type;
	int			error;

	/* These metadata btrees only ever live in the data fork. */
	if (rf->whichfork != XFS_DATA_FORK)
		return -EFSCORRUPTED;

	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		type = XFS_RTGI_RMAP;
		break;
	case XFS_METAFILE_RTREFCOUNT:
		type = XFS_RTGI_REFCOUNT;
		break;
	default:
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/* Find the rtgroup that claims this inode. */
	while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
		if (ip == rtg->rtg_inodes[type])
			goto found;
	}

	/*
	 * We should never find an rt metadata btree inode that isn't
	 * associated with an rtgroup yet has ondisk blocks allocated to it.
	 */
	if (ip->i_nblocks) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	return 0;

found:
	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
		break;
	case XFS_METAFILE_RTREFCOUNT:
		cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto out_rtg;
	}

	error = xrep_rmap_scan_iroot_btree(rf, cur);
	xfs_btree_del_cursor(cur, error);
out_rtg:
	/* Drop the reference taken by xfs_rtgroup_next. */
	xfs_rtgroup_rele(rtg);
	return error;
}

/* Find all the extents from a given AG in an inode fork.
 */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xrep_rmap_ifork	rf = {
		.accum		= { .rm_owner = ip->i_ino, },
		.rr		= rr,
		.whichfork	= whichfork,
	};
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	bool			mappings_done;
	int			error = 0;

	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		/*
		 * Scan the bmap btree for data device mappings.  This
		 * includes the btree blocks themselves, even if this is a
		 * realtime file.
		 */
		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
		if (error || mappings_done)
			return error;
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		/* Scan incore extent cache if this isn't a realtime file. */
		if (xfs_ifork_is_realtime(ip, whichfork))
			return 0;

		return xrep_rmap_scan_iext(&rf, ifp);
	case XFS_DINODE_FMT_META_BTREE:
		return xrep_rmap_scan_meta_btree(&rf, ip);
	}

	return 0;
}

/*
 * Take ILOCK on a file that we want to scan.
 *
 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
 * attr bmbt.  Otherwise, take ILOCK_SHARED.
 */
static inline unsigned int
xrep_rmap_scan_ilock(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df)) {
		lock_mode = XFS_ILOCK_EXCL;
		goto lock;
	}

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;

lock:
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

/*
 * Record reverse mappings for a file.  Scans both the data and attr forks
 * with the appropriate ILOCK held, then marks the inode visited so the
 * iscan cursor moves on.
 */
STATIC int
xrep_rmap_scan_inode(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip)
{
	unsigned int		lock_mode = xrep_rmap_scan_ilock(ip);
	int			error;

	/* Check the data fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	/* Check the attr fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
	if (error)
		goto out_unlock;

	/* COW fork extents are "owned" by the refcount btree. */

	xchk_iscan_mark_visited(&rr->iscan, ip);
out_unlock:
	xfs_iunlock(ip, lock_mode);
	return error;
}

/* Section (I): Find all AG metadata extents except for free space metadata. */

struct xrep_rmap_inodes {
	struct xrep_rmap	*rr;
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};

/*
 * Record inode btree rmaps: the inobt blocks along the cursor path, plus the
 * inode chunk blocks described by each inobt record (skipping sparse holes).
 */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes		*ri = priv;
	struct xfs_mount		*mp = cur->bc_mp;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agino_t			agino;
	xfs_agino_t			iperhole;
	unsigned int			i;
	int				error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/* Record a non-sparse inode chunk. */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		/* A chunk occupies at least one block even if inodes are big. */
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/*
	 * Iterate each chunk.  iperhole is the inode count covered by one
	 * stride; each stride advances the holemask index and the agino.
	 */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}

/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_inodes	ri = {
		.rr		= rr,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	xagb_bitmap_init(&ri.inobt_blocks);
	xagb_bitmap_init(&ri.ichunk_blocks);

	/*
	 * Iterate every record in the inobt so we can capture all the inode
	 * chunks and the blocks in the inobt itself.
	 */
	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
	if (error)
		goto out_bitmap;

	/*
	 * Note that if there are zero records in the inobt then query_all does
	 * nothing and we have to account the empty inobt root manually.
	 */
	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;

		error = xagb_bitmap_set(&ri.inobt_blocks,
				be32_to_cpu(agi->agi_root), 1);
		if (error)
			goto out_bitmap;
	}

	/* Scan the finobt too. */
	if (xfs_has_finobt(sc->mp)) {
		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
				sc->sa.fino_cur);
		if (error)
			goto out_bitmap;
	}

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
			&XFS_RMAP_OINFO_INOBT);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
			&XFS_RMAP_OINFO_INODES);

out_bitmap:
	xagb_bitmap_destroy(&ri.inobt_blocks);
	xagb_bitmap_destroy(&ri.ichunk_blocks);
	return error;
}

/*
 * Record a CoW staging extent.  Anything outside the CoW domain here is
 * corruption, since the query range below is restricted to that domain.
 */
STATIC int
xrep_rmap_walk_cowblocks(
	struct xfs_btree_cur		*cur,
	const struct xfs_refcount_irec	*irec,
	void				*priv)
{
	struct xagb_bitmap		*bitmap = priv;

	if (!xfs_refcount_check_domain(irec) ||
	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
		return -EFSCORRUPTED;

	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
}

/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rmap_find_refcount_rmaps(
	struct xrep_rmap	*rr)
{
	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
	struct xagb_bitmap	cow_blocks;		/* COWBIT */
	struct xfs_refcount_irec low = {
		.rc_startblock	= 0,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_refcount_irec high = {
		.rc_startblock	= -1U,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* No reflink means no refcountbt and no CoW staging extents. */
	if (!xfs_has_reflink(sc->mp))
		return 0;

	xagb_bitmap_init(&refcountbt_blocks);
	xagb_bitmap_init(&cow_blocks);

	/* refcountbt */
	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
	if (error)
		goto out_bitmap;

	/* Collect rmaps for CoW staging extents. */
	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
			xrep_rmap_walk_cowblocks, &cow_blocks);
	if (error)
		goto out_bitmap;

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC);

out_bitmap:
	xagb_bitmap_destroy(&cow_blocks);
	xagb_bitmap_destroy(&refcountbt_blocks);
	return error;
}

/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
STATIC int
xrep_rmap_find_agheader_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	/* Create a record for the AG sb->agfl. */
	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
			XFS_RMAP_OWN_FS, 0, 0);
}

/* Generate rmaps for the log, if it's in this AG. */
STATIC int
xrep_rmap_find_log_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
		return 0;

	return xrep_rmap_stash(rr,
			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
}

/*
 * Check and count all the records that we gathered.  Requires sc->sa.bno_cur
 * to be set up (see xrep_rmap_check_mapping).
 */
STATIC int
xrep_rmap_check_record(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_rmap		*rr = priv;
	int				error;

	error = xrep_rmap_check_mapping(rr->sc, rec);
	if (error)
		return error;

	rr->nr_records++;
	return 0;
}

/*
 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 * blocks, and the new btreeblks count.  Figure out if we have enough free
 * space to reconstruct the inode btrees.  The caller must clean up the lists
 * if anything goes wrong.  This implements section (I) above.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xchk_ag		*sa = &sc->sa;
	struct xfs_inode	*ip;
	struct xfs_btree_cur	*mcur;
	int			error;

	/* Find all the per-AG metadata. */
	xrep_ag_btcur_init(sc, &sc->sa);

	error = xrep_rmap_find_inode_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_refcount_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_agheader_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_log_rmaps(rr);
end_agscan:
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/*
	 * Set up for a potentially lengthy filesystem scan by reducing our
	 * transaction resource usage for the duration.  Specifically:
	 *
	 * Unlock the AG header buffers and cancel the transaction to release
	 * the log grant space while we scan the filesystem.
	 *
	 * Create a new empty transaction to eliminate the possibility of the
	 * inode scan deadlocking on cyclical metadata.
	 *
	 * We pass the empty transaction to the file scanning function to avoid
	 * repeatedly cycling empty transactions.  This can be done even though
	 * we take the IOLOCK to quiesce the file because empty transactions
	 * do not take sb_internal.
	 */
	sa->agf_bp = NULL;
	sa->agi_bp = NULL;
	xchk_trans_cancel(sc);
	/*
	 * NOTE(review): the return value is ignored here -- assumes
	 * xchk_trans_alloc_empty() cannot fail in this tree; confirm.
	 */
	xchk_trans_alloc_empty(sc);

	/* Iterate all AGs for inodes rmaps. */
	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
		error = xrep_rmap_scan_inode(rr, ip);
		xchk_irele(sc, ip);
		if (error)
			break;

		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&rr->iscan);
	if (error)
		return error;

	/*
	 * Switch out for a real transaction and lock the AG headers in
	 * preparation for building a new tree.
	 */
	xchk_trans_cancel(sc);
	error = xchk_setup_fs(sc);
	if (error)
		return error;
	error = xchk_perag_drain_and_lock(sc);
	if (error)
		return error;

	/*
	 * If a hook failed to update the in-memory btree, we lack the data to
	 * continue the repair.
	 */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	/*
	 * Now that we have everything locked again, we need to count the
	 * number of rmap records stashed in the btree.  This should reflect
	 * all actively-owned space in the filesystem.  At the same time, check
	 * all our records before we start building a new btree, which requires
	 * a bnobt cursor.
	 */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	rr->nr_records = 0;
	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);

	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	xfs_btree_del_cursor(mcur, error);

	return error;
}

/* Section (II): Reserving space for new rmapbt and setting free space bitmap */

struct xrep_rmap_agfl {
	struct xagb_bitmap	*bitmap;
	/* AG being repaired; recorded but not consulted by the walk below. */
	xfs_agnumber_t		agno;
};

/* Add an AGFL block to the rmap list. */
STATIC int
xrep_rmap_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_rmap_agfl	*ra = priv;

	return xagb_bitmap_set(ra->bitmap, agbno, 1);
}

/*
 * Run one round of reserving space for the new rmapbt and recomputing the
 * number of blocks needed to store the previously observed rmapbt records and
 * the ones we'll create for the free space metadata.  When we don't need more
 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
 * true.
 */
STATIC int
xrep_rmap_try_reserve(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur,
	struct xagb_bitmap	*freesp_blocks,
	uint64_t		*blocks_reserved,
	bool			*done)
{
	struct xrep_rmap_agfl	ra = {
		.bitmap		= freesp_blocks,
		.agno		= pag_agno(rr->sc->sa.pag),
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xrep_newbt_resv	*resv, *n;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	uint64_t		nr_blocks;	/* RMB */
	uint64_t		freesp_records;
	int			error;

	/*
	 * We're going to recompute new_btree.bload.nr_blocks at the end of
	 * this function to reflect however many btree blocks we need to store
	 * all the rmap records (including the ones that reflect the changes we
	 * made to support the new rmapbt blocks), so we save the old value
	 * here so we can decide if we've reserved enough blocks.
	 */
	nr_blocks = rr->new_btree.bload.nr_blocks;

	/*
	 * Make sure we've reserved enough space for the new btree.  This can
	 * change the shape of the free space btrees, which can cause secondary
	 * interactions with the rmap records because all three space btrees
	 * have the same rmap owner.  We'll account for all that below.
	 */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			nr_blocks - *blocks_reserved);
	if (error)
		return error;

	*blocks_reserved = rr->new_btree.bload.nr_blocks;

	/* Clear everything in the bitmap. */
	xagb_bitmap_destroy(freesp_blocks);

	/* Set all the bnobt blocks in the bitmap. */
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	if (error)
		return error;

	/* Set all the cntbt blocks in the bitmap. */
	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
	sc->sa.cnt_cur = NULL;
	if (error)
		return error;

	/* Record our new btreeblks value; subtract 2 for the two btree roots. */
	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;

	/* Set all the new rmapbt blocks in the bitmap. */
	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
		if (error)
			return error;
	}

	/* Set all the AGFL blocks in the bitmap. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
	if (error)
		return error;

	/* Count the extents in the bitmap. */
	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);

	/* Compute how many blocks we'll need for all the rmaps. */
	error = xfs_btree_bload_compute_geometry(rmap_cur,
			&rr->new_btree.bload, rr->nr_records + freesp_records);
	if (error)
		return error;

	/* We're done when we don't need more blocks. */
	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
	return 0;
}

/*
 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
 * the free space metadata.  This implements section (II) above.
 */
STATIC int
xrep_rmap_reserve_space(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur)
{
	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
	uint64_t		blocks_reserved = 0;
	bool			done = false;
	int			error;

	/* Compute how many blocks we'll need for the rmaps collected so far. */
	error = xfs_btree_bload_compute_geometry(rmap_cur,
			&rr->new_btree.bload, rr->nr_records);
	if (error)
		return error;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(rr->sc, &error))
		return error;

	xagb_bitmap_init(&freesp_blocks);

	/*
	 * Iteratively reserve space for the new rmapbt and recompute the
	 * number of blocks needed to store the previously observed rmapbt
	 * records and the ones we'll create for the free space metadata.
	 * Finish when we don't need more blocks.
	 */
	do {
		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
				&blocks_reserved, &done);
		if (error)
			goto out_bitmap;
	} while (!done);

	/* Emit rmaps for everything in the free space bitmap. */
	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
	xchk_ag_btcur_free(&rr->sc->sa);

out_bitmap:
	xagb_bitmap_destroy(&freesp_blocks);
	return error;
}

/* Section (III): Building the new rmap btree. */

/* Update the AGF counters. */
STATIC int
xrep_rmap_reset_counters(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	xfs_agblock_t		rmap_btblocks;

	/*
	 * The AGF header contains extra information related to the reverse
	 * mapping btree, so we must update those fields here.
	 */
	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the rmapbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Retrieve rmapbt data for bulk load.  Copies up to @nr_wanted records from
 * the in-memory btree cursor (rr->mcur) into the new btree block; returns
 * the number of records actually loaded, or a negative errno.
 */
STATIC int
xrep_rmap_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xrep_rmap	*rr = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		int		stat = 0;

		/* Advance to the next stashed record. */
		error = xfs_btree_increment(rr->mcur, 0, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		/* Copy it into the ondisk block format. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_rmap_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_rmap	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Custom allocation function for new rmap btrees.
 */
STATIC int
xrep_rmap_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	int			error;

	/*
	 * We don't want an rmap update on the allocation, since we iteratively
	 * compute the OWN_AG records /after/ allocating blocks for the records
	 * that we already know we need to store.  Therefore, fix the freelist
	 * with the NORMAP flag set so that we don't also try to create an rmap
	 * for new AGFL blocks.
	 */
	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
	if (error)
		return error;

	/*
	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
	 * free space btrees or by removing blocks from the AGFL and queueing
	 * an EFI to free the block, the transaction will be dirty.  This
	 * second case is of interest to us.
	 *
	 * Later on, we will need to compare gaps in the new recordset against
	 * the block usage of all OWN_AG owners in order to free the old
	 * btree's blocks, which means that we can't have EFIs for former AGFL
	 * blocks attached to the repair transaction when we commit the new
	 * btree.
	 *
	 * xrep_newbt_alloc_blocks guarantees this for us by calling
	 * xrep_defer_finish to commit anything that fix_freelist may have
	 * added to the transaction.
	 */
	return xfs_alloc_vextent_near_bno(args, alloc_hint);
}


/* Count the records in this btree. */
STATIC int
xrep_rmap_count_records(
	struct xfs_btree_cur	*cur,
	unsigned long long	*nr)
{
	int			running = 1;
	int			error;

	*nr = 0;

	/* Position the cursor just before the leftmost record... */
	error = xfs_btree_goto_left_edge(cur);
	if (error)
		return error;

	/* ...then count every record until increment runs off the right
	 * edge (running == 0) or fails. */
	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
		if (running)
			(*nr)++;
	}

	return error;
}
/*
 * Use the collected rmap information to stage a new rmap btree.
 * If this is successful we'll return with the new btree root information
 * logged to the repair transaction but not yet committed.  This implements
 * section (III) above.
 */
STATIC int
xrep_rmap_build_new_tree(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_btree_cur	*rmap_cur;
	int			error;

	/*
	 * Preserve the old rmapbt block count so that we can adjust the
	 * per-AG rmapbt reservation after we commit the new btree root and
	 * want to dispose of the old btree blocks.
	 */
	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.  The new blocks are accounted to the
	 * rmapbt per-AG reservation, which we will adjust further after
	 * committing the new btree.
	 */
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
			xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
			XFS_AG_RESV_RMAPBT);
	rr->new_btree.bload.get_records = xrep_rmap_get_records;
	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);

	/*
	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
	 * and compute OWN_AG rmaps.
	 */
	error = xrep_rmap_reserve_space(rr, rmap_cur);
	if (error)
		goto err_cur;

	/*
	 * Count the rmapbt records again, because the space reservation
	 * for the rmapbt itself probably added more records to the btree.
	 */
	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
			&rr->rmap_btree);

	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
	if (error)
		goto err_mcur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;

	/*
	 * Move the cursor to the left edge of the tree so that the first
	 * increment in ->get_records positions us at the first record.
	 */
	error = xfs_btree_goto_left_edge(rr->mcur);
	if (error)
		goto err_level;

	/* Add all observed rmap records. */
	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(rmap_cur, 0);
	xfs_btree_del_cursor(rr->mcur, 0);
	rr->mcur = NULL;

	/*
	 * Now that we've written the new btree to disk, we don't need to keep
	 * updating the in-memory btree.  Abort the scan to stop live updates.
	 */
	xchk_iscan_abort(&rr->iscan);

	/*
	 * The newly committed rmap recordset includes mappings for the blocks
	 * that we reserved to build the new btree.  If there is excess space
	 * reservation to be freed, the corresponding rmap records must also be
	 * removed.
	 */
	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_rmap_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

	/* Error labels fall through so earlier failures release more state. */
err_level:
	pag->pagf_repair_rmap_level = 0;
err_mcur:
	xfs_btree_del_cursor(rr->mcur, error);
err_cur:
	xfs_btree_del_cursor(rmap_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}

/* Section (IV): Reaping the old btree. */

struct xrep_rmap_find_gaps {
	struct xagb_bitmap	rmap_gaps;	/* blocks with no rmap record */
	xfs_agblock_t		next_agbno;	/* next unexamined agblock */
};

/* Subtract each free extent in the bnobt from the rmap gaps. */
STATIC int
xrep_rmap_find_freesp(
	struct xfs_btree_cur		*cur,
	const struct xfs_alloc_rec_incore *rec,
	void				*priv)
{
	struct xrep_rmap_find_gaps	*rfg = priv;

	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
			rec->ar_blockcount);
}

/* Record the free space we find, as part of cleaning out the btree. */
STATIC int
xrep_rmap_find_gaps(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_rmap_find_gaps	*rfg = priv;
	int				error;

	/* Anything between the last mapping and this one is a gap. */
	if (rec->rm_startblock > rfg->next_agbno) {
		error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
				rec->rm_startblock - rfg->next_agbno);
		if (error)
			return error;
	}

	/* max_t because rmap records can overlap. */
	rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}

/*
 * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
 * a list of gaps in the rmap records and a list of the extents mentioned in
 * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
 * in the bnobt is a block from the old rmapbt and can be removed.
 */
STATIC int
xrep_rmap_remove_old_tree(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_find_gaps rfg = {
		.next_agbno	= 0,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_btree_cur	*mcur;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&rfg.rmap_gaps);

	/* Compute free space from the new rmapbt. */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);

	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_bitmap;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (rfg.next_agbno < agend) {
		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
				agend - rfg.next_agbno);
		if (error)
			goto out_bitmap;
	}

	/* Compute free space from the existing bnobt. */
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
			&rfg);
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	if (error)
		goto out_bitmap;

	/*
	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
	 * block usage count back to the per-AG rmapbt reservation (and not
	 * fdblocks, since the rmap btree lives in free space) to keep the
	 * reservation and free space accounting correct.
	 */
	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
	if (error)
		goto out_bitmap;

	/*
	 * Now that we've zapped all the old rmapbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservation.
	 */
	pag->pagf_repair_rmap_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
out_bitmap:
	xagb_bitmap_destroy(&rfg.rmap_gaps);
	return error;
}

/* Decide if a live rmap update should be applied to the shadow btree. */
static inline bool
xrep_rmapbt_want_live_update(
	struct xchk_iscan		*iscan,
	const struct xfs_owner_info	*oi)
{
	if (xchk_iscan_aborted(iscan))
		return false;

	/*
	 * Before unlocking the AG header to perform the inode scan, we
	 * recorded reverse mappings for all AG metadata except for the OWN_AG
	 * metadata.  IOWs, the in-memory btree knows about the AG headers, the
	 * two inode btrees, the CoW staging extents, and the refcount btrees.
	 * For these types of metadata, we need to record the live updates in
	 * the in-memory rmap btree.
	 *
	 * However, we do not scan the free space btrees or the AGFL until we
	 * have re-locked the AGF and are ready to reserve space for the new
	 * rmap btree, so we do not want live updates for OWN_AG metadata.
	 */
	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
		return oi->oi_owner != XFS_RMAP_OWN_AG;

	/* Ignore updates to files that the scanner hasn't visited yet. */
	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
}

/*
 * Apply a rmapbt update from the regular filesystem into our shadow btree.
 * We're running from the thread that owns the AGF buffer and is generating
 * the update, so we must be careful about which parts of the struct xrep_rmap
 * that we change.
1598 */ 1599 static int 1600 xrep_rmapbt_live_update( 1601 struct notifier_block *nb, 1602 unsigned long action, 1603 void *data) 1604 { 1605 struct xfs_rmap_update_params *p = data; 1606 struct xrep_rmap *rr; 1607 struct xfs_mount *mp; 1608 struct xfs_btree_cur *mcur; 1609 struct xfs_trans *tp; 1610 int error; 1611 1612 rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); 1613 mp = rr->sc->mp; 1614 1615 if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) 1616 goto out_unlock; 1617 1618 trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); 1619 1620 tp = xfs_trans_alloc_empty(mp); 1621 1622 mutex_lock(&rr->lock); 1623 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); 1624 error = __xfs_rmap_finish_intent(mcur, action, p->startblock, 1625 p->blockcount, &p->oinfo, p->unwritten); 1626 xfs_btree_del_cursor(mcur, error); 1627 if (error) 1628 goto out_cancel; 1629 1630 error = xfbtree_trans_commit(&rr->rmap_btree, tp); 1631 if (error) 1632 goto out_cancel; 1633 1634 xfs_trans_cancel(tp); 1635 mutex_unlock(&rr->lock); 1636 return NOTIFY_DONE; 1637 1638 out_cancel: 1639 xfbtree_trans_cancel(&rr->rmap_btree, tp); 1640 xfs_trans_cancel(tp); 1641 mutex_unlock(&rr->lock); 1642 xchk_iscan_abort(&rr->iscan); 1643 out_unlock: 1644 return NOTIFY_DONE; 1645 } 1646 1647 /* Set up the filesystem scan components. */ 1648 STATIC int 1649 xrep_rmap_setup_scan( 1650 struct xrep_rmap *rr) 1651 { 1652 struct xfs_scrub *sc = rr->sc; 1653 int error; 1654 1655 mutex_init(&rr->lock); 1656 1657 /* Set up in-memory rmap btree */ 1658 error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, 1659 pag_agno(sc->sa.pag)); 1660 if (error) 1661 goto out_mutex; 1662 1663 /* Retry iget every tenth of a second for up to 30 seconds. */ 1664 xchk_iscan_start(sc, 30000, 100, &rr->iscan); 1665 1666 /* 1667 * Hook into live rmap operations so that we can update our in-memory 1668 * btree to reflect live changes on the filesystem. 
Since we drop the 1669 * AGF buffer to scan all the inodes, we need this piece to avoid 1670 * installing a stale btree. 1671 */ 1672 ASSERT(sc->flags & XCHK_FSGATES_RMAP); 1673 xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); 1674 error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook); 1675 if (error) 1676 goto out_iscan; 1677 return 0; 1678 1679 out_iscan: 1680 xchk_iscan_teardown(&rr->iscan); 1681 xfbtree_destroy(&rr->rmap_btree); 1682 out_mutex: 1683 mutex_destroy(&rr->lock); 1684 return error; 1685 } 1686 1687 /* Tear down scan components. */ 1688 STATIC void 1689 xrep_rmap_teardown( 1690 struct xrep_rmap *rr) 1691 { 1692 struct xfs_scrub *sc = rr->sc; 1693 1694 xchk_iscan_abort(&rr->iscan); 1695 xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook); 1696 xchk_iscan_teardown(&rr->iscan); 1697 xfbtree_destroy(&rr->rmap_btree); 1698 mutex_destroy(&rr->lock); 1699 } 1700 1701 /* Repair the rmap btree for some AG. */ 1702 int 1703 xrep_rmapbt( 1704 struct xfs_scrub *sc) 1705 { 1706 struct xrep_rmap *rr = sc->buf; 1707 int error; 1708 1709 error = xrep_rmap_setup_scan(rr); 1710 if (error) 1711 return error; 1712 1713 /* 1714 * Collect rmaps for everything in this AG that isn't space metadata. 1715 * These rmaps won't change even as we try to allocate blocks. 1716 */ 1717 error = xrep_rmap_find_rmaps(rr); 1718 if (error) 1719 goto out_records; 1720 1721 /* Rebuild the rmap information. */ 1722 error = xrep_rmap_build_new_tree(rr); 1723 if (error) 1724 goto out_records; 1725 1726 /* Kill the old tree. */ 1727 error = xrep_rmap_remove_old_tree(rr); 1728 if (error) 1729 goto out_records; 1730 1731 out_records: 1732 xrep_rmap_teardown(rr); 1733 return error; 1734 } 1735