1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_btree_staging.h" 15 #include "xfs_bit.h" 16 #include "xfs_log_format.h" 17 #include "xfs_trans.h" 18 #include "xfs_sb.h" 19 #include "xfs_alloc.h" 20 #include "xfs_alloc_btree.h" 21 #include "xfs_rmap.h" 22 #include "xfs_rmap_btree.h" 23 #include "xfs_inode.h" 24 #include "xfs_refcount.h" 25 #include "xfs_extent_busy.h" 26 #include "xfs_health.h" 27 #include "xfs_bmap.h" 28 #include "xfs_ialloc.h" 29 #include "xfs_ag.h" 30 #include "scrub/xfs_scrub.h" 31 #include "scrub/scrub.h" 32 #include "scrub/common.h" 33 #include "scrub/btree.h" 34 #include "scrub/trace.h" 35 #include "scrub/repair.h" 36 #include "scrub/bitmap.h" 37 #include "scrub/agb_bitmap.h" 38 #include "scrub/xfile.h" 39 #include "scrub/xfarray.h" 40 #include "scrub/newbt.h" 41 #include "scrub/reap.h" 42 43 /* 44 * Free Space Btree Repair 45 * ======================= 46 * 47 * The reverse mappings are supposed to record all space usage for the entire 48 * AG. Therefore, we can recreate the free extent records in an AG by looking 49 * for gaps in the physical extents recorded in the rmapbt. These records are 50 * staged in @free_records. Identifying the gaps is more difficult on a 51 * reflink filesystem because rmap records are allowed to overlap. 52 * 53 * Because the final step of building a new index is to free the space used by 54 * the old index, repair needs to find that space. Unfortunately, all 55 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share 56 * the same rmapbt owner code (OWN_AG), so this is not straightforward. 57 * 58 * The scan of the reverse mapping information records the space used by OWN_AG 59 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While 60 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to 61 * record all visited rmap btree blocks and all blocks owned by the AGFL. 62 * 63 * After that is where the definitions of old_allocbt_blocks shifts. This 64 * expression identifies possible former bnobt/cntbt blocks: 65 * 66 * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); 67 * 68 * Substituting from above definitions, that becomes: 69 * 70 * old_allocbt_blocks & ~not_allocbt_blocks 71 * 72 * The OWN_AG bitmap itself isn't needed after this point, so what we really do 73 * instead is: 74 * 75 * old_allocbt_blocks &= ~not_allocbt_blocks; 76 * 77 * After this point, @old_allocbt_blocks is a bitmap of alleged former 78 * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first 79 * parameter in place to avoid copying records around. 80 * 81 * Next, some of the space described by @free_records are diverted to the newbt 82 * reservation and used to format new btree blocks. The remaining records are 83 * written to the new btree indices. We reconstruct both bnobt and cntbt at 84 * the same time since we've already done all the work. 85 * 86 * We use the prefix 'xrep_abt' here because we regenerate both free space 87 * allocation btrees at the same time. 88 */ 89 90 struct xrep_abt { 91 /* Blocks owned by the rmapbt or the agfl. */ 92 struct xagb_bitmap not_allocbt_blocks; 93 94 /* All OWN_AG blocks. */ 95 struct xagb_bitmap old_allocbt_blocks; 96 97 /* 98 * New bnobt information. All btree block reservations are added to 99 * the reservation list in new_bnobt. 100 */ 101 struct xrep_newbt new_bnobt; 102 103 /* new cntbt information */ 104 struct xrep_newbt new_cntbt; 105 106 /* Free space extents. */ 107 struct xfarray *free_records; 108 109 struct xfs_scrub *sc; 110 111 /* Number of non-null records in @free_records. */ 112 uint64_t nr_real_records; 113 114 /* get_records()'s position in the free space record array. */ 115 xfarray_idx_t array_cur; 116 117 /* 118 * Next block we anticipate seeing in the rmap records. If the next 119 * rmap record is greater than next_agbno, we have found unused space. 120 */ 121 xfs_agblock_t next_agbno; 122 123 /* Number of free blocks in this AG. */ 124 xfs_agblock_t nr_blocks; 125 126 /* Longest free extent we found in the AG. */ 127 xfs_agblock_t longest; 128 }; 129 130 /* Set up to repair AG free space btrees. */ 131 int 132 xrep_setup_ag_allocbt( 133 struct xfs_scrub *sc) 134 { 135 struct xfs_group *xg = pag_group(sc->sa.pag); 136 unsigned int busy_gen; 137 138 /* 139 * Make sure the busy extent list is clear because we can't put extents 140 * on there twice. 141 */ 142 if (xfs_extent_busy_list_empty(xg, &busy_gen)) 143 return 0; 144 return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); 145 } 146 147 /* Check for any obvious conflicts in the free extent. */ 148 STATIC int 149 xrep_abt_check_free_ext( 150 struct xfs_scrub *sc, 151 const struct xfs_alloc_rec_incore *rec) 152 { 153 enum xbtree_recpacking outcome; 154 int error; 155 156 if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) 157 return -EFSCORRUPTED; 158 159 /* Must not be an inode chunk. */ 160 error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, 161 rec->ar_startblock, rec->ar_blockcount, &outcome); 162 if (error) 163 return error; 164 if (outcome != XBTREE_RECPACKING_EMPTY) 165 return -EFSCORRUPTED; 166 167 /* Must not be shared or CoW staging. */ 168 if (sc->sa.refc_cur) { 169 error = xfs_refcount_has_records(sc->sa.refc_cur, 170 XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, 171 rec->ar_blockcount, &outcome); 172 if (error) 173 return error; 174 if (outcome != XBTREE_RECPACKING_EMPTY) 175 return -EFSCORRUPTED; 176 177 error = xfs_refcount_has_records(sc->sa.refc_cur, 178 XFS_REFC_DOMAIN_COW, rec->ar_startblock, 179 rec->ar_blockcount, &outcome); 180 if (error) 181 return error; 182 if (outcome != XBTREE_RECPACKING_EMPTY) 183 return -EFSCORRUPTED; 184 } 185 186 return 0; 187 } 188 189 /* 190 * Stash a free space record for all the space since the last bno we found 191 * all the way up to @end. 192 */ 193 static int 194 xrep_abt_stash( 195 struct xrep_abt *ra, 196 xfs_agblock_t end) 197 { 198 struct xfs_alloc_rec_incore arec = { 199 .ar_startblock = ra->next_agbno, 200 .ar_blockcount = end - ra->next_agbno, 201 }; 202 struct xfs_scrub *sc = ra->sc; 203 int error = 0; 204 205 if (xchk_should_terminate(sc, &error)) 206 return error; 207 208 error = xrep_abt_check_free_ext(ra->sc, &arec); 209 if (error) 210 return error; 211 212 trace_xrep_abt_found(sc->sa.pag, &arec); 213 214 error = xfarray_append(ra->free_records, &arec); 215 if (error) 216 return error; 217 218 ra->nr_blocks += arec.ar_blockcount; 219 return 0; 220 } 221 222 /* Record extents that aren't in use from gaps in the rmap records. */ 223 STATIC int 224 xrep_abt_walk_rmap( 225 struct xfs_btree_cur *cur, 226 const struct xfs_rmap_irec *rec, 227 void *priv) 228 { 229 struct xrep_abt *ra = priv; 230 int error; 231 232 /* Record all the OWN_AG blocks... */ 233 if (rec->rm_owner == XFS_RMAP_OWN_AG) { 234 error = xagb_bitmap_set(&ra->old_allocbt_blocks, 235 rec->rm_startblock, rec->rm_blockcount); 236 if (error) 237 return error; 238 } 239 240 /* ...and all the rmapbt blocks... */ 241 error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); 242 if (error) 243 return error; 244 245 /* ...and all the free space. */ 246 if (rec->rm_startblock > ra->next_agbno) { 247 error = xrep_abt_stash(ra, rec->rm_startblock); 248 if (error) 249 return error; 250 } 251 252 /* 253 * rmap records can overlap on reflink filesystems, so project 254 * next_agbno as far out into the AG space as we currently know about. 255 */ 256 ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, 257 rec->rm_startblock + rec->rm_blockcount); 258 return 0; 259 } 260 261 /* Collect an AGFL block for the not-to-release list. */ 262 static int 263 xrep_abt_walk_agfl( 264 struct xfs_mount *mp, 265 xfs_agblock_t agbno, 266 void *priv) 267 { 268 struct xrep_abt *ra = priv; 269 270 return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); 271 } 272 273 /* 274 * Compare two free space extents by block number. We want to sort in order of 275 * increasing block number. 276 */ 277 static int 278 xrep_bnobt_extent_cmp( 279 const void *a, 280 const void *b) 281 { 282 const struct xfs_alloc_rec_incore *ap = a; 283 const struct xfs_alloc_rec_incore *bp = b; 284 285 if (ap->ar_startblock > bp->ar_startblock) 286 return 1; 287 else if (ap->ar_startblock < bp->ar_startblock) 288 return -1; 289 return 0; 290 } 291 292 /* 293 * Re-sort the free extents by block number so that we can put the records into 294 * the bnobt in the correct order. Make sure the records do not overlap in 295 * physical space. 296 */ 297 STATIC int 298 xrep_bnobt_sort_records( 299 struct xrep_abt *ra) 300 { 301 struct xfs_alloc_rec_incore arec; 302 xfarray_idx_t cur = XFARRAY_CURSOR_INIT; 303 xfs_agblock_t next_agbno = 0; 304 int error; 305 306 error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); 307 if (error) 308 return error; 309 310 while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { 311 if (arec.ar_startblock < next_agbno) 312 return -EFSCORRUPTED; 313 314 next_agbno = arec.ar_startblock + arec.ar_blockcount; 315 } 316 317 return error; 318 } 319 320 /* 321 * Compare two free space extents by length and then block number. We want 322 * to sort first in order of increasing length and then in order of increasing 323 * block number. 324 */ 325 static int 326 xrep_cntbt_extent_cmp( 327 const void *a, 328 const void *b) 329 { 330 const struct xfs_alloc_rec_incore *ap = a; 331 const struct xfs_alloc_rec_incore *bp = b; 332 333 if (ap->ar_blockcount > bp->ar_blockcount) 334 return 1; 335 else if (ap->ar_blockcount < bp->ar_blockcount) 336 return -1; 337 return xrep_bnobt_extent_cmp(a, b); 338 } 339 340 /* 341 * Sort the free extents by length so so that we can put the records into the 342 * cntbt in the correct order. Don't let userspace kill us if we're resorting 343 * after allocating btree blocks. 344 */ 345 STATIC int 346 xrep_cntbt_sort_records( 347 struct xrep_abt *ra, 348 bool is_resort) 349 { 350 return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, 351 is_resort ? 0 : XFARRAY_SORT_KILLABLE); 352 } 353 354 /* 355 * Iterate all reverse mappings to find (1) the gaps between rmap records (all 356 * unowned space), (2) the OWN_AG extents (which encompass the free space 357 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL 358 * blocks. The free space is (1) + (2) - (3) - (4). 359 */ 360 STATIC int 361 xrep_abt_find_freespace( 362 struct xrep_abt *ra) 363 { 364 struct xfs_scrub *sc = ra->sc; 365 struct xfs_mount *mp = sc->mp; 366 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 367 struct xfs_buf *agfl_bp; 368 xfs_agblock_t agend; 369 int error; 370 371 xagb_bitmap_init(&ra->not_allocbt_blocks); 372 373 xrep_ag_btcur_init(sc, &sc->sa); 374 375 /* 376 * Iterate all the reverse mappings to find gaps in the physical 377 * mappings, all the OWN_AG blocks, and all the rmapbt extents. 378 */ 379 error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); 380 if (error) 381 goto err; 382 383 /* Insert a record for space between the last rmap and EOAG. */ 384 agend = be32_to_cpu(agf->agf_length); 385 if (ra->next_agbno < agend) { 386 error = xrep_abt_stash(ra, agend); 387 if (error) 388 goto err; 389 } 390 391 /* Collect all the AGFL blocks. */ 392 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); 393 if (error) 394 goto err; 395 396 error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); 397 if (error) 398 goto err_agfl; 399 400 /* Compute the old bnobt/cntbt blocks. */ 401 error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, 402 &ra->not_allocbt_blocks); 403 if (error) 404 goto err_agfl; 405 406 ra->nr_real_records = xfarray_length(ra->free_records); 407 err_agfl: 408 xfs_trans_brelse(sc->tp, agfl_bp); 409 err: 410 xchk_ag_btcur_free(&sc->sa); 411 xagb_bitmap_destroy(&ra->not_allocbt_blocks); 412 return error; 413 } 414 415 /* 416 * We're going to use the observed free space records to reserve blocks for the 417 * new free space btrees, so we play an iterative game where we try to converge 418 * on the number of blocks we need: 419 * 420 * 1. Estimate how many blocks we'll need to store the records. 421 * 2. If the first free record has more blocks than we need, we're done. 422 * We will have to re-sort the records prior to building the cntbt. 423 * 3. If that record has exactly the number of blocks we need, null out the 424 * record. We're done. 425 * 4. Otherwise, we still need more blocks. Null out the record, subtract its 426 * length from the number of blocks we need, and go back to step 1. 427 * 428 * Fortunately, we don't have to do any transaction work to play this game, so 429 * we don't have to tear down the staging cursors. 430 */ 431 STATIC int 432 xrep_abt_reserve_space( 433 struct xrep_abt *ra, 434 struct xfs_btree_cur *bno_cur, 435 struct xfs_btree_cur *cnt_cur, 436 bool *needs_resort) 437 { 438 struct xfs_scrub *sc = ra->sc; 439 xfarray_idx_t record_nr; 440 unsigned int allocated = 0; 441 int error = 0; 442 443 record_nr = xfarray_length(ra->free_records) - 1; 444 do { 445 struct xfs_alloc_rec_incore arec; 446 uint64_t required; 447 unsigned int desired; 448 unsigned int len; 449 450 /* Compute how many blocks we'll need. */ 451 error = xfs_btree_bload_compute_geometry(cnt_cur, 452 &ra->new_cntbt.bload, ra->nr_real_records); 453 if (error) 454 break; 455 456 error = xfs_btree_bload_compute_geometry(bno_cur, 457 &ra->new_bnobt.bload, ra->nr_real_records); 458 if (error) 459 break; 460 461 /* How many btree blocks do we need to store all records? */ 462 required = ra->new_bnobt.bload.nr_blocks + 463 ra->new_cntbt.bload.nr_blocks; 464 ASSERT(required < INT_MAX); 465 466 /* If we've reserved enough blocks, we're done. */ 467 if (allocated >= required) 468 break; 469 470 desired = required - allocated; 471 472 /* We need space but there's none left; bye! */ 473 if (ra->nr_real_records == 0) { 474 error = -ENOSPC; 475 break; 476 } 477 478 /* Grab the first record from the list. */ 479 error = xfarray_load(ra->free_records, record_nr, &arec); 480 if (error) 481 break; 482 483 ASSERT(arec.ar_blockcount <= UINT_MAX); 484 len = min_t(unsigned int, arec.ar_blockcount, desired); 485 486 trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock, 487 len, XFS_RMAP_OWN_AG); 488 489 error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, 490 arec.ar_startblock, len); 491 if (error) 492 break; 493 allocated += len; 494 ra->nr_blocks -= len; 495 496 if (arec.ar_blockcount > desired) { 497 /* 498 * Record has more space than we need. The number of 499 * free records doesn't change, so shrink the free 500 * record, inform the caller that the records are no 501 * longer sorted by length, and exit. 502 */ 503 arec.ar_startblock += desired; 504 arec.ar_blockcount -= desired; 505 error = xfarray_store(ra->free_records, record_nr, 506 &arec); 507 if (error) 508 break; 509 510 *needs_resort = true; 511 return 0; 512 } 513 514 /* 515 * We're going to use up the entire record, so unset it and 516 * move on to the next one. This changes the number of free 517 * records (but doesn't break the sorting order), so we must 518 * go around the loop once more to re-run _bload_init. 519 */ 520 error = xfarray_unset(ra->free_records, record_nr); 521 if (error) 522 break; 523 ra->nr_real_records--; 524 record_nr--; 525 } while (1); 526 527 return error; 528 } 529 530 STATIC int 531 xrep_abt_dispose_one( 532 struct xrep_abt *ra, 533 struct xrep_newbt_resv *resv) 534 { 535 struct xfs_scrub *sc = ra->sc; 536 struct xfs_perag *pag = sc->sa.pag; 537 xfs_agblock_t free_agbno = resv->agbno + resv->used; 538 xfs_extlen_t free_aglen = resv->len - resv->used; 539 int error; 540 541 ASSERT(pag == resv->pag); 542 543 /* Add a deferred rmap for each extent we used. */ 544 if (resv->used > 0) 545 xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno, 546 resv->used, XFS_RMAP_OWN_AG); 547 548 /* 549 * For each reserved btree block we didn't use, add it to the free 550 * space btree. We didn't touch fdblocks when we reserved them, so 551 * we don't touch it now. 552 */ 553 if (free_aglen == 0) 554 return 0; 555 556 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, 557 ra->new_bnobt.oinfo.oi_owner); 558 559 error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, 560 &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); 561 if (error) 562 return error; 563 564 return xrep_defer_finish(sc); 565 } 566 567 /* 568 * Deal with all the space we reserved. Blocks that were allocated for the 569 * free space btrees need to have a (deferred) rmap added for the OWN_AG 570 * allocation, and blocks that didn't get used can be freed via the usual 571 * (deferred) means. 572 */ 573 STATIC void 574 xrep_abt_dispose_reservations( 575 struct xrep_abt *ra, 576 int error) 577 { 578 struct xrep_newbt_resv *resv, *n; 579 580 if (error) 581 goto junkit; 582 583 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { 584 error = xrep_abt_dispose_one(ra, resv); 585 if (error) 586 goto junkit; 587 } 588 589 junkit: 590 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { 591 xfs_perag_put(resv->pag); 592 list_del(&resv->list); 593 kfree(resv); 594 } 595 596 xrep_newbt_cancel(&ra->new_bnobt); 597 xrep_newbt_cancel(&ra->new_cntbt); 598 } 599 600 /* Retrieve free space data for bulk load. */ 601 STATIC int 602 xrep_abt_get_records( 603 struct xfs_btree_cur *cur, 604 unsigned int idx, 605 struct xfs_btree_block *block, 606 unsigned int nr_wanted, 607 void *priv) 608 { 609 struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; 610 struct xrep_abt *ra = priv; 611 union xfs_btree_rec *block_rec; 612 unsigned int loaded; 613 int error; 614 615 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { 616 error = xfarray_load_next(ra->free_records, &ra->array_cur, 617 arec); 618 if (error) 619 return error; 620 621 ra->longest = max(ra->longest, arec->ar_blockcount); 622 623 block_rec = xfs_btree_rec_addr(cur, idx, block); 624 cur->bc_ops->init_rec_from_cur(cur, block_rec); 625 } 626 627 return loaded; 628 } 629 630 /* Feed one of the new btree blocks to the bulk loader. */ 631 STATIC int 632 xrep_abt_claim_block( 633 struct xfs_btree_cur *cur, 634 union xfs_btree_ptr *ptr, 635 void *priv) 636 { 637 struct xrep_abt *ra = priv; 638 639 return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); 640 } 641 642 /* 643 * Reset the AGF counters to reflect the free space btrees that we just 644 * rebuilt, then reinitialize the per-AG data. 645 */ 646 STATIC int 647 xrep_abt_reset_counters( 648 struct xrep_abt *ra) 649 { 650 struct xfs_scrub *sc = ra->sc; 651 struct xfs_perag *pag = sc->sa.pag; 652 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 653 unsigned int freesp_btreeblks = 0; 654 655 /* 656 * Compute the contribution to agf_btreeblks for the new free space 657 * btrees. This is the computed btree size minus anything we didn't 658 * use. 659 */ 660 freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; 661 freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; 662 663 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); 664 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); 665 666 /* 667 * The AGF header contains extra information related to the free space 668 * btrees, so we must update those fields here. 669 */ 670 agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + 671 (be32_to_cpu(agf->agf_rmap_blocks) - 1)); 672 agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); 673 agf->agf_longest = cpu_to_be32(ra->longest); 674 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | 675 XFS_AGF_LONGEST | 676 XFS_AGF_FREEBLKS); 677 678 /* 679 * After we commit the new btree to disk, it is possible that the 680 * process to reap the old btree blocks will race with the AIL trying 681 * to checkpoint the old btree blocks into the filesystem. If the new 682 * tree is shorter than the old one, the allocbt write verifier will 683 * fail and the AIL will shut down the filesystem. 684 * 685 * To avoid this, save the old incore btree height values as the alt 686 * height values before re-initializing the perag info from the updated 687 * AGF to capture all the new values. 688 */ 689 pag->pagf_repair_bno_level = pag->pagf_bno_level; 690 pag->pagf_repair_cnt_level = pag->pagf_cnt_level; 691 692 /* Reinitialize with the values we just logged. */ 693 return xrep_reinit_pagf(sc); 694 } 695 696 /* 697 * Use the collected free space information to stage new free space btrees. 698 * If this is successful we'll return with the new btree root 699 * information logged to the repair transaction but not yet committed. 700 */ 701 STATIC int 702 xrep_abt_build_new_trees( 703 struct xrep_abt *ra) 704 { 705 struct xfs_scrub *sc = ra->sc; 706 struct xfs_btree_cur *bno_cur; 707 struct xfs_btree_cur *cnt_cur; 708 struct xfs_perag *pag = sc->sa.pag; 709 bool needs_resort = false; 710 int error; 711 712 /* 713 * Sort the free extents by length so that we can set up the free space 714 * btrees in as few extents as possible. This reduces the amount of 715 * deferred rmap / free work we have to do at the end. 716 */ 717 error = xrep_cntbt_sort_records(ra, false); 718 if (error) 719 return error; 720 721 /* 722 * Prepare to construct the new btree by reserving disk space for the 723 * new btree and setting up all the accounting information we'll need 724 * to root the new btree while it's under construction and before we 725 * attach it to the AG header. 726 */ 727 xrep_newbt_init_bare(&ra->new_bnobt, sc); 728 xrep_newbt_init_bare(&ra->new_cntbt, sc); 729 730 ra->new_bnobt.bload.get_records = xrep_abt_get_records; 731 ra->new_cntbt.bload.get_records = xrep_abt_get_records; 732 733 ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; 734 ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; 735 736 /* Allocate cursors for the staged btrees. */ 737 bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); 738 xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); 739 740 cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); 741 xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); 742 743 /* Last chance to abort before we start committing fixes. */ 744 if (xchk_should_terminate(sc, &error)) 745 goto err_cur; 746 747 /* Reserve the space we'll need for the new btrees. */ 748 error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); 749 if (error) 750 goto err_cur; 751 752 /* 753 * If we need to re-sort the free extents by length, do so so that we 754 * can put the records into the cntbt in the correct order. 755 */ 756 if (needs_resort) { 757 error = xrep_cntbt_sort_records(ra, needs_resort); 758 if (error) 759 goto err_cur; 760 } 761 762 /* 763 * Due to btree slack factors, it's possible for a new btree to be one 764 * level taller than the old btree. Update the alternate incore btree 765 * height so that we don't trip the verifiers when writing the new 766 * btree blocks to disk. 767 */ 768 pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; 769 pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; 770 771 /* Load the free space by length tree. */ 772 ra->array_cur = XFARRAY_CURSOR_INIT; 773 ra->longest = 0; 774 error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); 775 if (error) 776 goto err_levels; 777 778 error = xrep_bnobt_sort_records(ra); 779 if (error) 780 goto err_levels; 781 782 /* Load the free space by block number tree. */ 783 ra->array_cur = XFARRAY_CURSOR_INIT; 784 error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); 785 if (error) 786 goto err_levels; 787 788 /* 789 * Install the new btrees in the AG header. After this point the old 790 * btrees are no longer accessible and the new trees are live. 791 */ 792 xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); 793 xfs_btree_del_cursor(bno_cur, 0); 794 xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); 795 xfs_btree_del_cursor(cnt_cur, 0); 796 797 /* Reset the AGF counters now that we've changed the btree shape. */ 798 error = xrep_abt_reset_counters(ra); 799 if (error) 800 goto err_newbt; 801 802 /* Dispose of any unused blocks and the accounting information. */ 803 xrep_abt_dispose_reservations(ra, error); 804 805 return xrep_roll_ag_trans(sc); 806 807 err_levels: 808 pag->pagf_repair_bno_level = 0; 809 pag->pagf_repair_cnt_level = 0; 810 err_cur: 811 xfs_btree_del_cursor(cnt_cur, error); 812 xfs_btree_del_cursor(bno_cur, error); 813 err_newbt: 814 xrep_abt_dispose_reservations(ra, error); 815 return error; 816 } 817 818 /* 819 * Now that we've logged the roots of the new btrees, invalidate all of the 820 * old blocks and free them. 821 */ 822 STATIC int 823 xrep_abt_remove_old_trees( 824 struct xrep_abt *ra) 825 { 826 struct xfs_perag *pag = ra->sc->sa.pag; 827 int error; 828 829 /* Free the old btree blocks if they're not in use. */ 830 error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, 831 &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); 832 if (error) 833 return error; 834 835 /* 836 * Now that we've zapped all the old allocbt blocks we can turn off 837 * the alternate height mechanism. 838 */ 839 pag->pagf_repair_bno_level = 0; 840 pag->pagf_repair_cnt_level = 0; 841 return 0; 842 } 843 844 /* Repair the freespace btrees for some AG. */ 845 int 846 xrep_allocbt( 847 struct xfs_scrub *sc) 848 { 849 struct xrep_abt *ra; 850 struct xfs_mount *mp = sc->mp; 851 unsigned int busy_gen; 852 char *descr; 853 int error; 854 855 /* We require the rmapbt to rebuild anything. */ 856 if (!xfs_has_rmapbt(mp)) 857 return -EOPNOTSUPP; 858 859 ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); 860 if (!ra) 861 return -ENOMEM; 862 ra->sc = sc; 863 864 /* We rebuild both data structures. */ 865 sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; 866 867 /* 868 * Make sure the busy extent list is clear because we can't put extents 869 * on there twice. In theory we cleared this before we started, but 870 * let's not risk the filesystem. 871 */ 872 if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) { 873 error = -EDEADLOCK; 874 goto out_ra; 875 } 876 877 /* Set up enough storage to handle maximally fragmented free space. */ 878 descr = xchk_xfile_ag_descr(sc, "free space records"); 879 error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, 880 sizeof(struct xfs_alloc_rec_incore), 881 &ra->free_records); 882 kfree(descr); 883 if (error) 884 goto out_ra; 885 886 /* Collect the free space data and find the old btree blocks. */ 887 xagb_bitmap_init(&ra->old_allocbt_blocks); 888 error = xrep_abt_find_freespace(ra); 889 if (error) 890 goto out_bitmap; 891 892 /* Rebuild the free space information. */ 893 error = xrep_abt_build_new_trees(ra); 894 if (error) 895 goto out_bitmap; 896 897 /* Kill the old trees. */ 898 error = xrep_abt_remove_old_trees(ra); 899 if (error) 900 goto out_bitmap; 901 902 out_bitmap: 903 xagb_bitmap_destroy(&ra->old_allocbt_blocks); 904 xfarray_destroy(ra->free_records); 905 out_ra: 906 kfree(ra); 907 return error; 908 } 909 910 /* Make sure both btrees are ok after we've rebuilt them. */ 911 int 912 xrep_revalidate_allocbt( 913 struct xfs_scrub *sc) 914 { 915 __u32 old_type = sc->sm->sm_type; 916 int error; 917 918 /* 919 * We must update sm_type temporarily so that the tree-to-tree cross 920 * reference checks will work in the correct direction, and also so 921 * that tracing will report correctly if there are more errors. 922 */ 923 sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; 924 error = xchk_allocbt(sc); 925 if (error) 926 goto out; 927 928 sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; 929 error = xchk_allocbt(sc); 930 out: 931 sc->sm->sm_type = old_type; 932 return error; 933 } 934