1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_btree_staging.h" 15 #include "xfs_buf_mem.h" 16 #include "xfs_btree_mem.h" 17 #include "xfs_bit.h" 18 #include "xfs_log_format.h" 19 #include "xfs_trans.h" 20 #include "xfs_sb.h" 21 #include "xfs_alloc.h" 22 #include "xfs_alloc_btree.h" 23 #include "xfs_ialloc.h" 24 #include "xfs_ialloc_btree.h" 25 #include "xfs_rmap.h" 26 #include "xfs_rmap_btree.h" 27 #include "xfs_inode.h" 28 #include "xfs_icache.h" 29 #include "xfs_bmap.h" 30 #include "xfs_bmap_btree.h" 31 #include "xfs_refcount.h" 32 #include "xfs_refcount_btree.h" 33 #include "xfs_ag.h" 34 #include "xfs_rtrmap_btree.h" 35 #include "xfs_rtgroup.h" 36 #include "xfs_rtrefcount_btree.h" 37 #include "scrub/xfs_scrub.h" 38 #include "scrub/scrub.h" 39 #include "scrub/common.h" 40 #include "scrub/btree.h" 41 #include "scrub/trace.h" 42 #include "scrub/repair.h" 43 #include "scrub/bitmap.h" 44 #include "scrub/agb_bitmap.h" 45 #include "scrub/xfile.h" 46 #include "scrub/xfarray.h" 47 #include "scrub/iscan.h" 48 #include "scrub/newbt.h" 49 #include "scrub/reap.h" 50 51 /* 52 * Reverse Mapping Btree Repair 53 * ============================ 54 * 55 * This is the most involved of all the AG space btree rebuilds. Everywhere 56 * else in XFS we lock inodes and then AG data structures, but generating the 57 * list of rmap records requires that we be able to scan both block mapping 58 * btrees of every inode in the filesystem to see if it owns any extents in 59 * this AG. 
We can't tolerate any inode updates while we do this, so we 60 * freeze the filesystem to lock everyone else out, and grant ourselves 61 * special privileges to run transactions with regular background reclamation 62 * turned off. 63 * 64 * We also have to be very careful not to allow inode reclaim to start a 65 * transaction because all transactions (other than our own) will block. 66 * Deferred inode inactivation helps us out there. 67 * 68 * I) Reverse mappings for all non-space metadata and file data are collected 69 * according to the following algorithm: 70 * 71 * 1. For each fork of each inode: 72 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. 73 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate 74 * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree 75 * block. 76 * 1.3. If the incore extent map is loaded but the fork is in btree format, 77 * just visit the bmbt blocks to set the corresponding BMBIT areas. 78 * 1.4. From the incore extent map, accumulate each bmap that falls into our 79 * target AG. Remember, multiple bmap records can map to a single rmap 80 * record, so we cannot simply emit rmap records 1:1. 81 * 1.5. Emit rmap records for each extent in BMBIT and free it. 82 * 2. Create bitmaps INOBIT and ICHUNKBIT. 83 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, 84 * and set bits in INOBIT for each btree block. If the inobt has no records 85 * at all, we must be careful to record its root in INOBIT. 86 * 4. For each block in the finobt, set the corresponding INOBIT area. 87 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. 88 * 6. Create bitmaps REFCBIT and COWBIT. 89 * 7. For each CoW staging extent in the refcountbt, set the corresponding 90 * areas in COWBIT. 91 * 8. For each block in the refcountbt, set the corresponding REFCBIT area. 92 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. 93 * A. 
Emit rmap for the AG headers. 94 * B. Emit rmap for the log, if there is one. 95 * 96 * II) The rmapbt shape and space metadata rmaps are computed as follows: 97 * 98 * 1. Count the rmaps collected in the previous step. (= NR) 99 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) 100 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. 101 * 4. Create bitmap AGBIT. 102 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. 103 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. 104 * 7. Count the extents in AGBIT. (= AGNR) 105 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') 106 * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', 107 * and clear AGBIT. Go to step 5. 108 * A. Emit rmaps for each extent in AGBIT. 109 * 110 * III) The rmapbt is constructed and set in place as follows: 111 * 112 * 1. Sort the rmap records. 113 * 2. Bulk load the rmaps. 114 * 115 * IV) Reap the old btree blocks. 116 * 117 * 1. Create a bitmap OLDRMBIT. 118 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. 119 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. 120 * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are 121 * the parts of the AG that the rmap didn't find during its scan of the 122 * primary metadata and aren't known to be in the free space, which implies 123 * that they were the old rmapbt blocks. 124 * 5. Commit. 125 * 126 * We use the 'xrep_rmap' prefix for all the rmap functions. 
 */

/* Context for collecting rmaps */
struct xrep_rmap {
	/* new rmapbt information */
	struct xrep_newbt	new_btree;

	/* lock for the xfbtree and xfile */
	struct mutex		lock;

	/* rmap records generated from primary metadata */
	struct xfbtree		rmap_btree;

	/* scrub context that owns this repair state */
	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/* inode scan cursor */
	struct xchk_iscan	iscan;

	/* Number of non-freespace records found. */
	unsigned long long	nr_records;

	/* bnobt/cntbt contribution to btreeblks */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter */
	unsigned int		old_rmapbt_fsbcount;
};

/*
 * Set us up to repair reverse mapping btrees.
 *
 * Enables the rmap fsgates, sets up the xfile-backed in-memory btree that
 * will hold the observed rmap records, and attaches the repair context to
 * sc->buf.  Returns 0 or a negative errno.
 */
int
xrep_setup_ag_rmapbt(
	struct xfs_scrub	*sc)
{
	struct xrep_rmap	*rr;
	char			*descr;
	int			error;

	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);

	descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
	error = xrep_setup_xfbtree(sc, descr);
	kfree(descr);
	if (error)
		return error;

	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;

	rr->sc = sc;
	/* NOTE(review): rr is owned by sc->buf; presumably freed by scrub teardown — confirm. */
	sc->buf = rr;
	return 0;
}

/*
 * Make sure there's nothing funny about this mapping.
 *
 * A record is rejected (-EFSCORRUPTED) if it fails the generic irec checks
 * or if it overlaps anything tracked by the bnobt free space btree.
 */
STATIC int
xrep_rmap_check_mapping(
	struct xfs_scrub	*sc,
	const struct xfs_rmap_irec	*rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
			rec->rm_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/*
 * Store a reverse-mapping record in the in-memory btree.
 *
 * The mutex serializes access to the xfbtree between the scan thread and
 * the rmap hooks.  On failure the in-memory btree update is cancelled and
 * the inode scan is aborted, since the collected record set is no longer
 * trustworthy.
 */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	struct xfs_rmap_irec	rmap = {
		.rm_startblock	= startblock,
		.rm_blockcount	= blockcount,
		.rm_owner	= owner,
		.rm_offset	= offset,
		.rm_flags	= flags,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*mcur;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/* Someone else already invalidated the scan; give up now. */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	trace_xrep_rmap_found(sc->sa.pag, &rmap);

	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
	error = xfs_rmap_map_raw(mcur, &rmap);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
	if (error)
		goto out_abort;

	mutex_unlock(&rr->lock);
	return 0;

out_cancel:
	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
out_abort:
	xchk_iscan_abort(&rr->iscan);
	mutex_unlock(&rr->lock);
	return error;
}

/* Arguments carried through a bitmap walk into xrep_rmap_stash. */
struct xrep_rmap_stash_run {
	struct xrep_rmap	*rr;
	uint64_t		owner;
	unsigned int		rmap_flags;
};

/* Bitmap walk callback: stash one extent as a single rmap record. */
static int
xrep_rmap_stash_run(
	uint32_t		start,
	uint32_t		len,
	void			*priv)
{
	struct xrep_rmap_stash_run	*rsr = priv;
	struct xrep_rmap	*rr = rsr->rr;

	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
}

/*
 * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
 * that the ranges are in units of FS blocks.
282 */ 283 STATIC int 284 xrep_rmap_stash_bitmap( 285 struct xrep_rmap *rr, 286 struct xagb_bitmap *bitmap, 287 const struct xfs_owner_info *oinfo) 288 { 289 struct xrep_rmap_stash_run rsr = { 290 .rr = rr, 291 .owner = oinfo->oi_owner, 292 .rmap_flags = 0, 293 }; 294 295 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 296 rsr.rmap_flags |= XFS_RMAP_ATTR_FORK; 297 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 298 rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK; 299 300 return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr); 301 } 302 303 /* Section (I): Finding all file and bmbt extents. */ 304 305 /* Context for accumulating rmaps for an inode fork. */ 306 struct xrep_rmap_ifork { 307 /* 308 * Accumulate rmap data here to turn multiple adjacent bmaps into a 309 * single rmap. 310 */ 311 struct xfs_rmap_irec accum; 312 313 /* Bitmap of bmbt blocks in this AG. */ 314 struct xagb_bitmap bmbt_blocks; 315 316 struct xrep_rmap *rr; 317 318 /* Which inode fork? */ 319 int whichfork; 320 }; 321 322 /* Stash an rmap that we accumulated while walking an inode fork. */ 323 STATIC int 324 xrep_rmap_stash_accumulated( 325 struct xrep_rmap_ifork *rf) 326 { 327 if (rf->accum.rm_blockcount == 0) 328 return 0; 329 330 return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock, 331 rf->accum.rm_blockcount, rf->accum.rm_owner, 332 rf->accum.rm_offset, rf->accum.rm_flags); 333 } 334 335 /* Accumulate a bmbt record. 
 */
STATIC int
xrep_rmap_visit_bmbt(
	struct xfs_btree_cur	*cur,
	struct xfs_bmbt_irec	*rec,
	void			*priv)
{
	/* @cur is unused here and may be NULL (the iext scan passes NULL). */
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_mount	*mp = rf->rr->sc->mp;
	struct xfs_rmap_irec	*accum = &rf->accum;
	xfs_agblock_t		agbno;
	unsigned int		rmap_flags = 0;
	int			error;

	/* Ignore mappings that belong to some other AG. */
	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
	    pag_agno(rf->rr->sc->sa.pag))
		return 0;

	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
	if (rf->whichfork == XFS_ATTR_FORK)
		rmap_flags |= XFS_RMAP_ATTR_FORK;
	if (rec->br_state == XFS_EXT_UNWRITTEN)
		rmap_flags |= XFS_RMAP_UNWRITTEN;

	/* If this bmap is adjacent to the previous one, just add it. */
	if (accum->rm_blockcount > 0 &&
	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
	    agbno == accum->rm_startblock + accum->rm_blockcount &&
	    rmap_flags == accum->rm_flags) {
		accum->rm_blockcount += rec->br_blockcount;
		return 0;
	}

	/* Otherwise stash the old rmap and start accumulating a new one. */
	error = xrep_rmap_stash_accumulated(rf);
	if (error)
		return error;

	accum->rm_startblock = agbno;
	accum->rm_blockcount = rec->br_blockcount;
	accum->rm_offset = rec->br_startoff;
	accum->rm_flags = rmap_flags;
	return 0;
}

/* Add a btree block to the bitmap.
 */
STATIC int
xrep_rmap_visit_iroot_btree_block(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*priv)
{
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;
	xfs_agblock_t		agbno;

	/* The root lives in the inode core, so it has no buffer; skip it. */
	xfs_btree_get_block(cur, level, &bp);
	if (!bp)
		return 0;

	/* Only record blocks that sit in the AG being repaired. */
	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
		return 0;

	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
}

/*
 * Iterate a metadata btree rooted in an inode to collect rmap records for
 * anything in this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iroot_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_btree_cur	*cur)
{
	struct xfs_owner_info	oinfo;
	struct xrep_rmap	*rr = rf->rr;
	int			error;

	xagb_bitmap_init(&rf->bmbt_blocks);

	/* Record all the blocks in the btree itself. */
	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
			XFS_BTREE_VISIT_ALL, rf);
	if (error)
		goto out;

	/* Emit rmaps for the btree blocks. */
	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
	if (error)
		goto out;

	/* Stash any remaining accumulated rmaps. */
	error = xrep_rmap_stash_accumulated(rf);
out:
	xagb_bitmap_destroy(&rf->bmbt_blocks);
	return error;
}

/*
 * Iterate the block mapping btree to collect rmap records for anything in this
 * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
 * block mappings in this fork.
 */
STATIC int
xrep_rmap_scan_bmbt(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip,
	bool			*mappings_done)
{
	struct xrep_rmap	*rr = rf->rr;
	struct xfs_btree_cur	*cur;
	struct xfs_ifork	*ifp;
	int			error;

	*mappings_done = false;
	ifp = xfs_ifork_ptr(ip, rf->whichfork);
	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);

	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
	    xfs_need_iread_extents(ifp)) {
		/*
		 * If the incore extent cache isn't loaded, scan the bmbt for
		 * mapping records.  This avoids loading the incore extent
		 * tree, which will increase memory pressure at a time when
		 * we're trying to run as quickly as we possibly can.  Ignore
		 * realtime extents.
		 */
		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
		if (error)
			goto out_cur;

		*mappings_done = true;
	}

	/* Scan for the bmbt blocks, which always live on the data device. */
	error = xrep_rmap_scan_iroot_btree(rf, cur);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Iterate the in-core extent cache to collect rmap records for anything in
 * this fork that matches the AG.
 */
STATIC int
xrep_rmap_scan_iext(
	struct xrep_rmap_ifork	*rf,
	struct xfs_ifork	*ifp)
{
	struct xfs_bmbt_irec	rec;
	struct xfs_iext_cursor	icur;
	int			error;

	for_each_xfs_iext(ifp, &icur, &rec) {
		/* Delayed allocations have no backing blocks to map. */
		if (isnullstartblock(rec.br_startblock))
			continue;
		/* No btree cursor needed for incore records. */
		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
		if (error)
			return error;
	}

	/* Flush whatever mapping is still being accumulated. */
	return xrep_rmap_stash_accumulated(rf);
}

/*
 * Collect rmaps for the data-device blocks of a realtime metadata btree
 * inode (rtrmap/rtrefcount).  Only the data fork of such an inode may hold
 * a metadata btree.
 */
static int
xrep_rmap_scan_meta_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip)
{
	struct xfs_scrub	*sc = rf->rr->sc;
	struct xfs_rtgroup	*rtg = NULL;
	struct xfs_btree_cur	*cur = NULL;
	enum xfs_rtg_inodes	type;
	int			error;

	if (rf->whichfork != XFS_DATA_FORK)
		return -EFSCORRUPTED;

	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		type = XFS_RTGI_RMAP;
		break;
	case XFS_METAFILE_RTREFCOUNT:
		type = XFS_RTGI_REFCOUNT;
		break;
	default:
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/* Find the rtgroup that owns this metadata inode. */
	while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
		if (ip == rtg->rtg_inodes[type])
			goto found;
	}

	/*
	 * We should never find an rt metadata btree inode that isn't
	 * associated with an rtgroup yet has ondisk blocks allocated to it.
	 */
	if (ip->i_nblocks) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	return 0;

found:
	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
		break;
	case XFS_METAFILE_RTREFCOUNT:
		cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto out_rtg;
	}

	error = xrep_rmap_scan_iroot_btree(rf, cur);
	xfs_btree_del_cursor(cur, error);
out_rtg:
	/* Drop the reference taken by xfs_rtgroup_next(). */
	xfs_rtgroup_rele(rtg);
	return error;
}

/* Find all the extents from a given AG in an inode fork. */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xrep_rmap_ifork	rf = {
		.accum		= { .rm_owner = ip->i_ino, },
		.rr		= rr,
		.whichfork	= whichfork,
	};
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	bool			mappings_done;
	int			error = 0;

	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		/*
		 * Scan the bmap btree for data device mappings.  This includes
		 * the btree blocks themselves, even if this is a realtime
		 * file.
		 */
		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
		if (error || mappings_done)
			return error;
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		/* Scan incore extent cache if this isn't a realtime file. */
		if (xfs_ifork_is_realtime(ip, whichfork))
			return 0;

		return xrep_rmap_scan_iext(&rf, ifp);
	case XFS_DINODE_FMT_META_BTREE:
		return xrep_rmap_scan_meta_btree(&rf, ip);
	}

	return 0;
}

/*
 * Take ILOCK on a file that we want to scan.
 *
 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
 * attr bmbt.  Otherwise, take ILOCK_SHARED.
 */
static inline unsigned int
xrep_rmap_scan_ilock(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df)) {
		lock_mode = XFS_ILOCK_EXCL;
		goto lock;
	}

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;

lock:
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

/* Record reverse mappings for a file. */
STATIC int
xrep_rmap_scan_inode(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip)
{
	unsigned int		lock_mode = xrep_rmap_scan_ilock(ip);
	int			error;

	/* Check the data fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	/* Check the attr fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
	if (error)
		goto out_unlock;

	/* COW fork extents are "owned" by the refcount btree. */

	xchk_iscan_mark_visited(&rr->iscan, ip);
out_unlock:
	xfs_iunlock(ip, lock_mode);
	return error;
}

/* Section (I): Find all AG metadata extents except for free space metadata. */

struct xrep_rmap_inodes {
	struct xrep_rmap	*rr;
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};

/* Record inode btree rmaps. */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur	*cur,
	const union xfs_btree_rec	*rec,
	void			*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes	*ri = priv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agino_t		agino;
	xfs_agino_t		iperhole;
	unsigned int		i;
	int			error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/* Record a non-sparse inode chunk. */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		/* A chunk always covers at least one block. */
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/* Iterate each chunk. */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}

/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_inodes	ri = {
		.rr		= rr,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	xagb_bitmap_init(&ri.inobt_blocks);
	xagb_bitmap_init(&ri.ichunk_blocks);

	/*
	 * Iterate every record in the inobt so we can capture all the inode
	 * chunks and the blocks in the inobt itself.
	 */
	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
	if (error)
		goto out_bitmap;

	/*
	 * Note that if there are zero records in the inobt then query_all does
	 * nothing and we have to account the empty inobt root manually.
	 */
	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;

		error = xagb_bitmap_set(&ri.inobt_blocks,
				be32_to_cpu(agi->agi_root), 1);
		if (error)
			goto out_bitmap;
	}

	/* Scan the finobt too. */
	if (xfs_has_finobt(sc->mp)) {
		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
				sc->sa.fino_cur);
		if (error)
			goto out_bitmap;
	}

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
			&XFS_RMAP_OINFO_INOBT);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
			&XFS_RMAP_OINFO_INODES);

out_bitmap:
	xagb_bitmap_destroy(&ri.inobt_blocks);
	xagb_bitmap_destroy(&ri.ichunk_blocks);
	return error;
}

/* Record a CoW staging extent. */
STATIC int
xrep_rmap_walk_cowblocks(
	struct xfs_btree_cur	*cur,
	const struct xfs_refcount_irec	*irec,
	void			*priv)
{
	struct xagb_bitmap	*bitmap = priv;

	/* Only CoW-domain records are expected by this query. */
	if (!xfs_refcount_check_domain(irec) ||
	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
		return -EFSCORRUPTED;

	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
}

/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rmap_find_refcount_rmaps(
	struct xrep_rmap	*rr)
{
	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
	struct xagb_bitmap	cow_blocks;		/* COWBIT */
	struct xfs_refcount_irec	low = {
		.rc_startblock	= 0,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_refcount_irec	high = {
		.rc_startblock	= -1U,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* No refcount btree without the reflink feature. */
	if (!xfs_has_reflink(sc->mp))
		return 0;

	xagb_bitmap_init(&refcountbt_blocks);
	xagb_bitmap_init(&cow_blocks);

	/* refcountbt */
	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
	if (error)
		goto out_bitmap;

	/* Collect rmaps for CoW staging extents. */
	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
			xrep_rmap_walk_cowblocks, &cow_blocks);
	if (error)
		goto out_bitmap;

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC);

out_bitmap:
	xagb_bitmap_destroy(&cow_blocks);
	xagb_bitmap_destroy(&refcountbt_blocks);
	return error;
}

/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
STATIC int
xrep_rmap_find_agheader_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	/* Create a record for the AG sb->agfl. */
	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
			XFS_RMAP_OWN_FS, 0, 0);
}

/* Generate rmaps for the log, if it's in this AG. */
STATIC int
xrep_rmap_find_log_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
		return 0;

	return xrep_rmap_stash(rr,
			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
}

/* Check and count all the records that we gathered. */
STATIC int
xrep_rmap_check_record(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec	*rec,
	void			*priv)
{
	struct xrep_rmap	*rr = priv;
	int			error;

	error = xrep_rmap_check_mapping(rr->sc, rec);
	if (error)
		return error;

	rr->nr_records++;
	return 0;
}

/*
 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 * blocks, and the new btreeblks count.  Figure out if we have enough free
 * space to reconstruct the inode btrees.  The caller must clean up the lists
 * if anything goes wrong.  This implements section (I) above.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xchk_ag		*sa = &sc->sa;
	struct xfs_inode	*ip;
	struct xfs_btree_cur	*mcur;
	int			error;

	/* Find all the per-AG metadata. */
	xrep_ag_btcur_init(sc, &sc->sa);

	error = xrep_rmap_find_inode_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_refcount_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_agheader_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_log_rmaps(rr);
end_agscan:
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/*
	 * Set up for a potentially lengthy filesystem scan by reducing our
	 * transaction resource usage for the duration.  Specifically:
	 *
	 * Unlock the AG header buffers and cancel the transaction to release
	 * the log grant space while we scan the filesystem.
	 *
	 * Create a new empty transaction to eliminate the possibility of the
	 * inode scan deadlocking on cyclical metadata.
	 *
	 * We pass the empty transaction to the file scanning function to avoid
	 * repeatedly cycling empty transactions.  This can be done even though
	 * we take the IOLOCK to quiesce the file because empty transactions
	 * do not take sb_internal.
	 */
	sa->agf_bp = NULL;
	sa->agi_bp = NULL;
	xchk_trans_cancel(sc);
	error = xchk_trans_alloc_empty(sc);
	if (error)
		return error;

	/* Iterate all AGs for inodes rmaps. */
	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
		error = xrep_rmap_scan_inode(rr, ip);
		xchk_irele(sc, ip);
		if (error)
			break;

		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&rr->iscan);
	if (error)
		return error;

	/*
	 * Switch out for a real transaction and lock the AG headers in
	 * preparation for building a new tree.
	 */
	xchk_trans_cancel(sc);
	error = xchk_setup_fs(sc);
	if (error)
		return error;
	error = xchk_perag_drain_and_lock(sc);
	if (error)
		return error;

	/*
	 * If a hook failed to update the in-memory btree, we lack the data to
	 * continue the repair.
	 */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	/*
	 * Now that we have everything locked again, we need to count the
	 * number of rmap records stashed in the btree.  This should reflect
	 * all actively-owned space in the filesystem.  At the same time, check
	 * all our records before we start building a new btree, which requires
	 * a bnobt cursor.
	 */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	rr->nr_records = 0;
	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);

	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	xfs_btree_del_cursor(mcur, error);

	return error;
}

/* Section (II): Reserving space for new rmapbt and setting free space bitmap */

/* Context for the AGFL walk below. */
struct xrep_rmap_agfl {
	struct xagb_bitmap	*bitmap;
	xfs_agnumber_t		agno;
};

/* Add an AGFL block to the rmap list. */
STATIC int
xrep_rmap_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_rmap_agfl	*ra = priv;

	return xagb_bitmap_set(ra->bitmap, agbno, 1);
}

/*
 * Run one round of reserving space for the new rmapbt and recomputing the
 * number of blocks needed to store the previously observed rmapbt records and
 * the ones we'll create for the free space metadata.  When we don't need more
 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
 * true.
1037 */ 1038 STATIC int 1039 xrep_rmap_try_reserve( 1040 struct xrep_rmap *rr, 1041 struct xfs_btree_cur *rmap_cur, 1042 struct xagb_bitmap *freesp_blocks, 1043 uint64_t *blocks_reserved, 1044 bool *done) 1045 { 1046 struct xrep_rmap_agfl ra = { 1047 .bitmap = freesp_blocks, 1048 .agno = pag_agno(rr->sc->sa.pag), 1049 }; 1050 struct xfs_scrub *sc = rr->sc; 1051 struct xrep_newbt_resv *resv, *n; 1052 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 1053 struct xfs_buf *agfl_bp; 1054 uint64_t nr_blocks; /* RMB */ 1055 uint64_t freesp_records; 1056 int error; 1057 1058 /* 1059 * We're going to recompute new_btree.bload.nr_blocks at the end of 1060 * this function to reflect however many btree blocks we need to store 1061 * all the rmap records (including the ones that reflect the changes we 1062 * made to support the new rmapbt blocks), so we save the old value 1063 * here so we can decide if we've reserved enough blocks. 1064 */ 1065 nr_blocks = rr->new_btree.bload.nr_blocks; 1066 1067 /* 1068 * Make sure we've reserved enough space for the new btree. This can 1069 * change the shape of the free space btrees, which can cause secondary 1070 * interactions with the rmap records because all three space btrees 1071 * have the same rmap owner. We'll account for all that below. 1072 */ 1073 error = xrep_newbt_alloc_blocks(&rr->new_btree, 1074 nr_blocks - *blocks_reserved); 1075 if (error) 1076 return error; 1077 1078 *blocks_reserved = rr->new_btree.bload.nr_blocks; 1079 1080 /* Clear everything in the bitmap. */ 1081 xagb_bitmap_destroy(freesp_blocks); 1082 1083 /* Set all the bnobt blocks in the bitmap. */ 1084 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, 1085 sc->sa.pag); 1086 error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur); 1087 xfs_btree_del_cursor(sc->sa.bno_cur, error); 1088 sc->sa.bno_cur = NULL; 1089 if (error) 1090 return error; 1091 1092 /* Set all the cntbt blocks in the bitmap. 
*/ 1093 sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, 1094 sc->sa.pag); 1095 error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur); 1096 xfs_btree_del_cursor(sc->sa.cnt_cur, error); 1097 sc->sa.cnt_cur = NULL; 1098 if (error) 1099 return error; 1100 1101 /* Record our new btreeblks value. */ 1102 rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2; 1103 1104 /* Set all the new rmapbt blocks in the bitmap. */ 1105 list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) { 1106 error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len); 1107 if (error) 1108 return error; 1109 } 1110 1111 /* Set all the AGFL blocks in the bitmap. */ 1112 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); 1113 if (error) 1114 return error; 1115 1116 error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra); 1117 if (error) 1118 return error; 1119 1120 /* Count the extents in the bitmap. */ 1121 freesp_records = xagb_bitmap_count_set_regions(freesp_blocks); 1122 1123 /* Compute how many blocks we'll need for all the rmaps. */ 1124 error = xfs_btree_bload_compute_geometry(rmap_cur, 1125 &rr->new_btree.bload, rr->nr_records + freesp_records); 1126 if (error) 1127 return error; 1128 1129 /* We're done when we don't need more blocks. */ 1130 *done = nr_blocks >= rr->new_btree.bload.nr_blocks; 1131 return 0; 1132 } 1133 1134 /* 1135 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for 1136 * the free space metadata. This implements section (II) above. 1137 */ 1138 STATIC int 1139 xrep_rmap_reserve_space( 1140 struct xrep_rmap *rr, 1141 struct xfs_btree_cur *rmap_cur) 1142 { 1143 struct xagb_bitmap freesp_blocks; /* AGBIT */ 1144 uint64_t blocks_reserved = 0; 1145 bool done = false; 1146 int error; 1147 1148 /* Compute how many blocks we'll need for the rmaps collected so far. 
*/ 1149 error = xfs_btree_bload_compute_geometry(rmap_cur, 1150 &rr->new_btree.bload, rr->nr_records); 1151 if (error) 1152 return error; 1153 1154 /* Last chance to abort before we start committing fixes. */ 1155 if (xchk_should_terminate(rr->sc, &error)) 1156 return error; 1157 1158 xagb_bitmap_init(&freesp_blocks); 1159 1160 /* 1161 * Iteratively reserve space for the new rmapbt and recompute the 1162 * number of blocks needed to store the previously observed rmapbt 1163 * records and the ones we'll create for the free space metadata. 1164 * Finish when we don't need more blocks. 1165 */ 1166 do { 1167 error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks, 1168 &blocks_reserved, &done); 1169 if (error) 1170 goto out_bitmap; 1171 } while (!done); 1172 1173 /* Emit rmaps for everything in the free space bitmap. */ 1174 xrep_ag_btcur_init(rr->sc, &rr->sc->sa); 1175 error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG); 1176 xchk_ag_btcur_free(&rr->sc->sa); 1177 1178 out_bitmap: 1179 xagb_bitmap_destroy(&freesp_blocks); 1180 return error; 1181 } 1182 1183 /* Section (III): Building the new rmap btree. */ 1184 1185 /* Update the AGF counters. */ 1186 STATIC int 1187 xrep_rmap_reset_counters( 1188 struct xrep_rmap *rr) 1189 { 1190 struct xfs_scrub *sc = rr->sc; 1191 struct xfs_perag *pag = sc->sa.pag; 1192 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 1193 xfs_agblock_t rmap_btblocks; 1194 1195 /* 1196 * The AGF header contains extra information related to the reverse 1197 * mapping btree, so we must update those fields here. 1198 */ 1199 rmap_btblocks = rr->new_btree.afake.af_blocks - 1; 1200 agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks); 1201 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS); 1202 1203 /* 1204 * After we commit the new btree to disk, it is possible that the 1205 * process to reap the old btree blocks will race with the AIL trying 1206 * to checkpoint the old btree blocks into the filesystem. 
If the new 1207 * tree is shorter than the old one, the rmapbt write verifier will 1208 * fail and the AIL will shut down the filesystem. 1209 * 1210 * To avoid this, save the old incore btree height values as the alt 1211 * height values before re-initializing the perag info from the updated 1212 * AGF to capture all the new values. 1213 */ 1214 pag->pagf_repair_rmap_level = pag->pagf_rmap_level; 1215 1216 /* Reinitialize with the values we just logged. */ 1217 return xrep_reinit_pagf(sc); 1218 } 1219 1220 /* Retrieve rmapbt data for bulk load. */ 1221 STATIC int 1222 xrep_rmap_get_records( 1223 struct xfs_btree_cur *cur, 1224 unsigned int idx, 1225 struct xfs_btree_block *block, 1226 unsigned int nr_wanted, 1227 void *priv) 1228 { 1229 struct xrep_rmap *rr = priv; 1230 union xfs_btree_rec *block_rec; 1231 unsigned int loaded; 1232 int error; 1233 1234 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { 1235 int stat = 0; 1236 1237 error = xfs_btree_increment(rr->mcur, 0, &stat); 1238 if (error) 1239 return error; 1240 if (!stat) 1241 return -EFSCORRUPTED; 1242 1243 error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); 1244 if (error) 1245 return error; 1246 if (!stat) 1247 return -EFSCORRUPTED; 1248 1249 block_rec = xfs_btree_rec_addr(cur, idx, block); 1250 cur->bc_ops->init_rec_from_cur(cur, block_rec); 1251 } 1252 1253 return loaded; 1254 } 1255 1256 /* Feed one of the new btree blocks to the bulk loader. */ 1257 STATIC int 1258 xrep_rmap_claim_block( 1259 struct xfs_btree_cur *cur, 1260 union xfs_btree_ptr *ptr, 1261 void *priv) 1262 { 1263 struct xrep_rmap *rr = priv; 1264 1265 return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); 1266 } 1267 1268 /* Custom allocation function for new rmap btrees. 
*/ 1269 STATIC int 1270 xrep_rmap_alloc_vextent( 1271 struct xfs_scrub *sc, 1272 struct xfs_alloc_arg *args, 1273 xfs_fsblock_t alloc_hint) 1274 { 1275 int error; 1276 1277 /* 1278 * We don't want an rmap update on the allocation, since we iteratively 1279 * compute the OWN_AG records /after/ allocating blocks for the records 1280 * that we already know we need to store. Therefore, fix the freelist 1281 * with the NORMAP flag set so that we don't also try to create an rmap 1282 * for new AGFL blocks. 1283 */ 1284 error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP); 1285 if (error) 1286 return error; 1287 1288 /* 1289 * If xrep_fix_freelist fixed the freelist by moving blocks from the 1290 * free space btrees or by removing blocks from the AGFL and queueing 1291 * an EFI to free the block, the transaction will be dirty. This 1292 * second case is of interest to us. 1293 * 1294 * Later on, we will need to compare gaps in the new recordset against 1295 * the block usage of all OWN_AG owners in order to free the old 1296 * btree's blocks, which means that we can't have EFIs for former AGFL 1297 * blocks attached to the repair transaction when we commit the new 1298 * btree. 1299 * 1300 * xrep_newbt_alloc_blocks guarantees this for us by calling 1301 * xrep_defer_finish to commit anything that fix_freelist may have 1302 * added to the transaction. 1303 */ 1304 return xfs_alloc_vextent_near_bno(args, alloc_hint); 1305 } 1306 1307 1308 /* Count the records in this btree. */ 1309 STATIC int 1310 xrep_rmap_count_records( 1311 struct xfs_btree_cur *cur, 1312 unsigned long long *nr) 1313 { 1314 int running = 1; 1315 int error; 1316 1317 *nr = 0; 1318 1319 error = xfs_btree_goto_left_edge(cur); 1320 if (error) 1321 return error; 1322 1323 while (running && !(error = xfs_btree_increment(cur, 0, &running))) { 1324 if (running) 1325 (*nr)++; 1326 } 1327 1328 return error; 1329 } 1330 /* 1331 * Use the collected rmap information to stage a new rmap btree. 
If this is 1332 * successful we'll return with the new btree root information logged to the 1333 * repair transaction but not yet committed. This implements section (III) 1334 * above. 1335 */ 1336 STATIC int 1337 xrep_rmap_build_new_tree( 1338 struct xrep_rmap *rr) 1339 { 1340 struct xfs_scrub *sc = rr->sc; 1341 struct xfs_perag *pag = sc->sa.pag; 1342 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 1343 struct xfs_btree_cur *rmap_cur; 1344 int error; 1345 1346 /* 1347 * Preserve the old rmapbt block count so that we can adjust the 1348 * per-AG rmapbt reservation after we commit the new btree root and 1349 * want to dispose of the old btree blocks. 1350 */ 1351 rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks); 1352 1353 /* 1354 * Prepare to construct the new btree by reserving disk space for the 1355 * new btree and setting up all the accounting information we'll need 1356 * to root the new btree while it's under construction and before we 1357 * attach it to the AG header. The new blocks are accounted to the 1358 * rmapbt per-AG reservation, which we will adjust further after 1359 * committing the new btree. 1360 */ 1361 xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, 1362 xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)), 1363 XFS_AG_RESV_RMAPBT); 1364 rr->new_btree.bload.get_records = xrep_rmap_get_records; 1365 rr->new_btree.bload.claim_block = xrep_rmap_claim_block; 1366 rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; 1367 rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag); 1368 xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake); 1369 1370 /* 1371 * Initialize @rr->new_btree, reserve space for the new rmapbt, 1372 * and compute OWN_AG rmaps. 1373 */ 1374 error = xrep_rmap_reserve_space(rr, rmap_cur); 1375 if (error) 1376 goto err_cur; 1377 1378 /* 1379 * Count the rmapbt records again, because the space reservation 1380 * for the rmapbt itself probably added more records to the btree. 
1381 */ 1382 rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, 1383 &rr->rmap_btree); 1384 1385 error = xrep_rmap_count_records(rr->mcur, &rr->nr_records); 1386 if (error) 1387 goto err_mcur; 1388 1389 /* 1390 * Due to btree slack factors, it's possible for a new btree to be one 1391 * level taller than the old btree. Update the incore btree height so 1392 * that we don't trip the verifiers when writing the new btree blocks 1393 * to disk. 1394 */ 1395 pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; 1396 1397 /* 1398 * Move the cursor to the left edge of the tree so that the first 1399 * increment in ->get_records positions us at the first record. 1400 */ 1401 error = xfs_btree_goto_left_edge(rr->mcur); 1402 if (error) 1403 goto err_level; 1404 1405 /* Add all observed rmap records. */ 1406 error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); 1407 if (error) 1408 goto err_level; 1409 1410 /* 1411 * Install the new btree in the AG header. After this point the old 1412 * btree is no longer accessible and the new tree is live. 1413 */ 1414 xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); 1415 xfs_btree_del_cursor(rmap_cur, 0); 1416 xfs_btree_del_cursor(rr->mcur, 0); 1417 rr->mcur = NULL; 1418 1419 /* 1420 * Now that we've written the new btree to disk, we don't need to keep 1421 * updating the in-memory btree. Abort the scan to stop live updates. 1422 */ 1423 xchk_iscan_abort(&rr->iscan); 1424 1425 /* 1426 * The newly committed rmap recordset includes mappings for the blocks 1427 * that we reserved to build the new btree. If there is excess space 1428 * reservation to be freed, the corresponding rmap records must also be 1429 * removed. 1430 */ 1431 rr->new_btree.oinfo = XFS_RMAP_OINFO_AG; 1432 1433 /* Reset the AGF counters now that we've changed the btree shape. */ 1434 error = xrep_rmap_reset_counters(rr); 1435 if (error) 1436 goto err_newbt; 1437 1438 /* Dispose of any unused blocks and the accounting information. 
*/ 1439 error = xrep_newbt_commit(&rr->new_btree); 1440 if (error) 1441 return error; 1442 1443 return xrep_roll_ag_trans(sc); 1444 1445 err_level: 1446 pag->pagf_repair_rmap_level = 0; 1447 err_mcur: 1448 xfs_btree_del_cursor(rr->mcur, error); 1449 err_cur: 1450 xfs_btree_del_cursor(rmap_cur, error); 1451 err_newbt: 1452 xrep_newbt_cancel(&rr->new_btree); 1453 return error; 1454 } 1455 1456 /* Section (IV): Reaping the old btree. */ 1457 1458 struct xrep_rmap_find_gaps { 1459 struct xagb_bitmap rmap_gaps; 1460 xfs_agblock_t next_agbno; 1461 }; 1462 1463 /* Subtract each free extent in the bnobt from the rmap gaps. */ 1464 STATIC int 1465 xrep_rmap_find_freesp( 1466 struct xfs_btree_cur *cur, 1467 const struct xfs_alloc_rec_incore *rec, 1468 void *priv) 1469 { 1470 struct xrep_rmap_find_gaps *rfg = priv; 1471 1472 return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock, 1473 rec->ar_blockcount); 1474 } 1475 1476 /* Record the free space we find, as part of cleaning out the btree. */ 1477 STATIC int 1478 xrep_rmap_find_gaps( 1479 struct xfs_btree_cur *cur, 1480 const struct xfs_rmap_irec *rec, 1481 void *priv) 1482 { 1483 struct xrep_rmap_find_gaps *rfg = priv; 1484 int error; 1485 1486 if (rec->rm_startblock > rfg->next_agbno) { 1487 error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, 1488 rec->rm_startblock - rfg->next_agbno); 1489 if (error) 1490 return error; 1491 } 1492 1493 rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, 1494 rec->rm_startblock + rec->rm_blockcount); 1495 return 0; 1496 } 1497 1498 /* 1499 * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make 1500 * a list of gaps in the rmap records and a list of the extents mentioned in 1501 * the bnobt. Any block that's in the new rmapbt gap list but not mentioned 1502 * in the bnobt is a block from the old rmapbt and can be removed. 
1503 */ 1504 STATIC int 1505 xrep_rmap_remove_old_tree( 1506 struct xrep_rmap *rr) 1507 { 1508 struct xrep_rmap_find_gaps rfg = { 1509 .next_agbno = 0, 1510 }; 1511 struct xfs_scrub *sc = rr->sc; 1512 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 1513 struct xfs_perag *pag = sc->sa.pag; 1514 struct xfs_btree_cur *mcur; 1515 xfs_agblock_t agend; 1516 int error; 1517 1518 xagb_bitmap_init(&rfg.rmap_gaps); 1519 1520 /* Compute free space from the new rmapbt. */ 1521 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); 1522 1523 error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); 1524 xfs_btree_del_cursor(mcur, error); 1525 if (error) 1526 goto out_bitmap; 1527 1528 /* Insert a record for space between the last rmap and EOAG. */ 1529 agend = be32_to_cpu(agf->agf_length); 1530 if (rfg.next_agbno < agend) { 1531 error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, 1532 agend - rfg.next_agbno); 1533 if (error) 1534 goto out_bitmap; 1535 } 1536 1537 /* Compute free space from the existing bnobt. */ 1538 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, 1539 sc->sa.pag); 1540 error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, 1541 &rfg); 1542 xfs_btree_del_cursor(sc->sa.bno_cur, error); 1543 sc->sa.bno_cur = NULL; 1544 if (error) 1545 goto out_bitmap; 1546 1547 /* 1548 * Free the "free" blocks that the new rmapbt knows about but the bnobt 1549 * doesn't--these are the old rmapbt blocks. Credit the old rmapbt 1550 * block usage count back to the per-AG rmapbt reservation (and not 1551 * fdblocks, since the rmap btree lives in free space) to keep the 1552 * reservation and free space accounting correct. 
1553 */ 1554 error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, 1555 &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); 1556 if (error) 1557 goto out_bitmap; 1558 1559 /* 1560 * Now that we've zapped all the old rmapbt blocks we can turn off 1561 * the alternate height mechanism and reset the per-AG space 1562 * reservation. 1563 */ 1564 pag->pagf_repair_rmap_level = 0; 1565 sc->flags |= XREP_RESET_PERAG_RESV; 1566 out_bitmap: 1567 xagb_bitmap_destroy(&rfg.rmap_gaps); 1568 return error; 1569 } 1570 1571 static inline bool 1572 xrep_rmapbt_want_live_update( 1573 struct xchk_iscan *iscan, 1574 const struct xfs_owner_info *oi) 1575 { 1576 if (xchk_iscan_aborted(iscan)) 1577 return false; 1578 1579 /* 1580 * Before unlocking the AG header to perform the inode scan, we 1581 * recorded reverse mappings for all AG metadata except for the OWN_AG 1582 * metadata. IOWs, the in-memory btree knows about the AG headers, the 1583 * two inode btrees, the CoW staging extents, and the refcount btrees. 1584 * For these types of metadata, we need to record the live updates in 1585 * the in-memory rmap btree. 1586 * 1587 * However, we do not scan the free space btrees or the AGFL until we 1588 * have re-locked the AGF and are ready to reserve space for the new 1589 * rmap btree, so we do not want live updates for OWN_AG metadata. 1590 */ 1591 if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) 1592 return oi->oi_owner != XFS_RMAP_OWN_AG; 1593 1594 /* Ignore updates to files that the scanner hasn't visited yet. */ 1595 return xchk_iscan_want_live_update(iscan, oi->oi_owner); 1596 } 1597 1598 /* 1599 * Apply a rmapbt update from the regular filesystem into our shadow btree. 1600 * We're running from the thread that owns the AGF buffer and is generating 1601 * the update, so we must be careful about which parts of the struct xrep_rmap 1602 * that we change. 
1603 */ 1604 static int 1605 xrep_rmapbt_live_update( 1606 struct notifier_block *nb, 1607 unsigned long action, 1608 void *data) 1609 { 1610 struct xfs_rmap_update_params *p = data; 1611 struct xrep_rmap *rr; 1612 struct xfs_mount *mp; 1613 struct xfs_btree_cur *mcur; 1614 struct xfs_trans *tp; 1615 void *txcookie; 1616 int error; 1617 1618 rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); 1619 mp = rr->sc->mp; 1620 1621 if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) 1622 goto out_unlock; 1623 1624 trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); 1625 1626 error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); 1627 if (error) 1628 goto out_abort; 1629 1630 mutex_lock(&rr->lock); 1631 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); 1632 error = __xfs_rmap_finish_intent(mcur, action, p->startblock, 1633 p->blockcount, &p->oinfo, p->unwritten); 1634 xfs_btree_del_cursor(mcur, error); 1635 if (error) 1636 goto out_cancel; 1637 1638 error = xfbtree_trans_commit(&rr->rmap_btree, tp); 1639 if (error) 1640 goto out_cancel; 1641 1642 xrep_trans_cancel_hook_dummy(&txcookie, tp); 1643 mutex_unlock(&rr->lock); 1644 return NOTIFY_DONE; 1645 1646 out_cancel: 1647 xfbtree_trans_cancel(&rr->rmap_btree, tp); 1648 xrep_trans_cancel_hook_dummy(&txcookie, tp); 1649 out_abort: 1650 mutex_unlock(&rr->lock); 1651 xchk_iscan_abort(&rr->iscan); 1652 out_unlock: 1653 return NOTIFY_DONE; 1654 } 1655 1656 /* Set up the filesystem scan components. */ 1657 STATIC int 1658 xrep_rmap_setup_scan( 1659 struct xrep_rmap *rr) 1660 { 1661 struct xfs_scrub *sc = rr->sc; 1662 int error; 1663 1664 mutex_init(&rr->lock); 1665 1666 /* Set up in-memory rmap btree */ 1667 error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, 1668 pag_agno(sc->sa.pag)); 1669 if (error) 1670 goto out_mutex; 1671 1672 /* Retry iget every tenth of a second for up to 30 seconds. 
*/ 1673 xchk_iscan_start(sc, 30000, 100, &rr->iscan); 1674 1675 /* 1676 * Hook into live rmap operations so that we can update our in-memory 1677 * btree to reflect live changes on the filesystem. Since we drop the 1678 * AGF buffer to scan all the inodes, we need this piece to avoid 1679 * installing a stale btree. 1680 */ 1681 ASSERT(sc->flags & XCHK_FSGATES_RMAP); 1682 xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); 1683 error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook); 1684 if (error) 1685 goto out_iscan; 1686 return 0; 1687 1688 out_iscan: 1689 xchk_iscan_teardown(&rr->iscan); 1690 xfbtree_destroy(&rr->rmap_btree); 1691 out_mutex: 1692 mutex_destroy(&rr->lock); 1693 return error; 1694 } 1695 1696 /* Tear down scan components. */ 1697 STATIC void 1698 xrep_rmap_teardown( 1699 struct xrep_rmap *rr) 1700 { 1701 struct xfs_scrub *sc = rr->sc; 1702 1703 xchk_iscan_abort(&rr->iscan); 1704 xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook); 1705 xchk_iscan_teardown(&rr->iscan); 1706 xfbtree_destroy(&rr->rmap_btree); 1707 mutex_destroy(&rr->lock); 1708 } 1709 1710 /* Repair the rmap btree for some AG. */ 1711 int 1712 xrep_rmapbt( 1713 struct xfs_scrub *sc) 1714 { 1715 struct xrep_rmap *rr = sc->buf; 1716 int error; 1717 1718 error = xrep_rmap_setup_scan(rr); 1719 if (error) 1720 return error; 1721 1722 /* 1723 * Collect rmaps for everything in this AG that isn't space metadata. 1724 * These rmaps won't change even as we try to allocate blocks. 1725 */ 1726 error = xrep_rmap_find_rmaps(rr); 1727 if (error) 1728 goto out_records; 1729 1730 /* Rebuild the rmap information. */ 1731 error = xrep_rmap_build_new_tree(rr); 1732 if (error) 1733 goto out_records; 1734 1735 /* Kill the old tree. */ 1736 error = xrep_rmap_remove_old_tree(rr); 1737 if (error) 1738 goto out_records; 1739 1740 out_records: 1741 xrep_rmap_teardown(rr); 1742 return error; 1743 } 1744