1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_btree_staging.h" 15 #include "xfs_bit.h" 16 #include "xfs_log_format.h" 17 #include "xfs_trans.h" 18 #include "xfs_sb.h" 19 #include "xfs_alloc.h" 20 #include "xfs_alloc_btree.h" 21 #include "xfs_rmap.h" 22 #include "xfs_rmap_btree.h" 23 #include "xfs_inode.h" 24 #include "xfs_refcount.h" 25 #include "xfs_extent_busy.h" 26 #include "xfs_health.h" 27 #include "xfs_bmap.h" 28 #include "xfs_ialloc.h" 29 #include "xfs_ag.h" 30 #include "scrub/xfs_scrub.h" 31 #include "scrub/scrub.h" 32 #include "scrub/common.h" 33 #include "scrub/btree.h" 34 #include "scrub/trace.h" 35 #include "scrub/repair.h" 36 #include "scrub/bitmap.h" 37 #include "scrub/agb_bitmap.h" 38 #include "scrub/xfile.h" 39 #include "scrub/xfarray.h" 40 #include "scrub/newbt.h" 41 #include "scrub/reap.h" 42 43 /* 44 * Free Space Btree Repair 45 * ======================= 46 * 47 * The reverse mappings are supposed to record all space usage for the entire 48 * AG. Therefore, we can recreate the free extent records in an AG by looking 49 * for gaps in the physical extents recorded in the rmapbt. These records are 50 * staged in @free_records. Identifying the gaps is more difficult on a 51 * reflink filesystem because rmap records are allowed to overlap. 52 * 53 * Because the final step of building a new index is to free the space used by 54 * the old index, repair needs to find that space. Unfortunately, all 55 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share 56 * the same rmapbt owner code (OWN_AG), so this is not straightforward. 
 *
 * The scan of the reverse mapping information records the space used by OWN_AG
 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.  While
 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
 * record all visited rmap btree blocks and all blocks owned by the AGFL.
 *
 * After that is where the definition of old_allocbt_blocks shifts.  This
 * expression identifies possible former bnobt/cntbt blocks:
 *
 *	(OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
 *
 * Substituting from above definitions, that becomes:
 *
 *	old_allocbt_blocks & ~not_allocbt_blocks
 *
 * The OWN_AG bitmap itself isn't needed after this point, so what we really do
 * instead is:
 *
 *	old_allocbt_blocks &= ~not_allocbt_blocks;
 *
 * After this point, @old_allocbt_blocks is a bitmap of alleged former
 * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
 * parameter in place to avoid copying records around.
 *
 * Next, some of the space described by @free_records are diverted to the newbt
 * reservation and used to format new btree blocks.  The remaining records are
 * written to the new btree indices.  We reconstruct both bnobt and cntbt at
 * the same time since we've already done all the work.
 *
 * We use the prefix 'xrep_abt' here because we regenerate both free space
 * allocation btrees at the same time.
 */

/* Repair context for rebuilding the two free space btrees of one AG. */
struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/* All OWN_AG blocks. */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* new cntbt information */
	struct xrep_newbt	new_cntbt;

	/* Free space extents. */
	struct xfarray		*free_records;

	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG. */
	xfs_agblock_t		longest;
};

/* Set up to repair AG free space btrees. */
int
xrep_setup_ag_allocbt(
	struct xfs_scrub	*sc)
{
	unsigned int		busy_gen;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.
	 */
	busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
	if (xfs_extent_busy_list_empty(sc->sa.pag))
		return 0;

	return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
}

/* Check for any obvious conflicts in the free extent. */
STATIC int
xrep_abt_check_free_ext(
	struct xfs_scrub	*sc,
	const struct xfs_alloc_rec_incore *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->ar_startblock, rec->ar_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be shared or CoW staging. */
	if (sc->sa.refc_cur) {
		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;

		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Stash a free space record for all the space since the last bno we found
 * all the way up to @end.
 */
static int
xrep_abt_stash(
	struct xrep_abt		*ra,
	xfs_agblock_t		end)
{
	struct xfs_alloc_rec_incore arec = {
		.ar_startblock	= ra->next_agbno,
		.ar_blockcount	= end - ra->next_agbno,
	};
	struct xfs_scrub	*sc = ra->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	error = xrep_abt_check_free_ext(ra->sc, &arec);
	if (error)
		return error;

	trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);

	error = xfarray_append(ra->free_records, &arec);
	if (error)
		return error;

	ra->nr_blocks += arec.ar_blockcount;
	return 0;
}

/* Record extents that aren't in use from gaps in the rmap records. */
STATIC int
xrep_abt_walk_rmap(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *rec,
	void			*priv)
{
	struct xrep_abt		*ra = priv;
	int			error;

	/* Record all the OWN_AG blocks... */
	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
				rec->rm_startblock, rec->rm_blockcount);
		if (error)
			return error;
	}

	/* ...and all the rmapbt blocks... */
	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
	if (error)
		return error;

	/* ...and all the free space. */
	if (rec->rm_startblock > ra->next_agbno) {
		error = xrep_abt_stash(ra, rec->rm_startblock);
		if (error)
			return error;
	}

	/*
	 * rmap records can overlap on reflink filesystems, so project
	 * next_agbno as far out into the AG space as we currently know about.
	 */
	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}

/* Collect an AGFL block for the not-to-release list. */
static int
xrep_abt_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
}

/*
 * Compare two free space extents by block number.  We want to sort in order of
 * increasing block number.
 */
static int
xrep_bnobt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_startblock > bp->ar_startblock)
		return 1;
	else if (ap->ar_startblock < bp->ar_startblock)
		return -1;
	return 0;
}

/*
 * Re-sort the free extents by block number so that we can put the records into
 * the bnobt in the correct order.  Make sure the records do not overlap in
 * physical space.
 */
STATIC int
xrep_bnobt_sort_records(
	struct xrep_abt		*ra)
{
	struct xfs_alloc_rec_incore arec;
	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		next_agbno = 0;
	int			error;

	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
	if (error)
		return error;

	/* A record starting before the previous one ended means overlap. */
	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
		if (arec.ar_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = arec.ar_startblock + arec.ar_blockcount;
	}

	return error;
}

/*
 * Compare two free space extents by length and then block number.  We want
 * to sort first in order of increasing length and then in order of increasing
 * block number.
 */
static int
xrep_cntbt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_blockcount > bp->ar_blockcount)
		return 1;
	else if (ap->ar_blockcount < bp->ar_blockcount)
		return -1;
	return xrep_bnobt_extent_cmp(a, b);
}

/*
 * Sort the free extents by length so that we can put the records into the
 * cntbt in the correct order.  Don't let userspace kill us if we're resorting
 * after allocating btree blocks.
 */
STATIC int
xrep_cntbt_sort_records(
	struct xrep_abt		*ra,
	bool			is_resort)
{
	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
}

/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks.  The free space is (1) + (2) - (3) - (4).
 */
STATIC int
xrep_abt_find_freespace(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&ra->not_allocbt_blocks);

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Iterate all the reverse mappings to find gaps in the physical
	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
	 */
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
	if (error)
		goto err;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (ra->next_agbno < agend) {
		error = xrep_abt_stash(ra, agend);
		if (error)
			goto err;
	}

	/* Collect all the AGFL blocks. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		goto err;

	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
	if (error)
		goto err_agfl;

	/* Compute the old bnobt/cntbt blocks. */
	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
			&ra->not_allocbt_blocks);
	if (error)
		goto err_agfl;

	ra->nr_real_records = xfarray_length(ra->free_records);
err_agfl:
	xfs_trans_brelse(sc->tp, agfl_bp);
err:
	xchk_ag_btcur_free(&sc->sa);
	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
	return error;
}

/*
 * We're going to use the observed free space records to reserve blocks for the
 * new free space btrees, so we play an iterative game where we try to converge
 * on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the first free record has more blocks than we need, we're done.
 *    We will have to re-sort the records prior to building the cntbt.
 * 3.
 *    If that record has exactly the number of blocks we need, null out the
 *    record.  We're done.
 * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
 */
STATIC int
xrep_abt_reserve_space(
	struct xrep_abt		*ra,
	struct xfs_btree_cur	*bno_cur,
	struct xfs_btree_cur	*cnt_cur,
	bool			*needs_resort)
{
	struct xfs_scrub	*sc = ra->sc;
	xfarray_idx_t		record_nr;
	unsigned int		allocated = 0;
	int			error = 0;

	/*
	 * Records are sorted by increasing length, so consume from the back
	 * of the array (the longest records) first.
	 */
	record_nr = xfarray_length(ra->free_records) - 1;
	do {
		struct xfs_alloc_rec_incore arec;
		uint64_t	required;
		unsigned int	desired;
		unsigned int	len;

		/* Compute how many blocks we'll need. */
		error = xfs_btree_bload_compute_geometry(cnt_cur,
				&ra->new_cntbt.bload, ra->nr_real_records);
		if (error)
			break;

		error = xfs_btree_bload_compute_geometry(bno_cur,
				&ra->new_bnobt.bload, ra->nr_real_records);
		if (error)
			break;

		/* How many btree blocks do we need to store all records? */
		required = ra->new_bnobt.bload.nr_blocks +
			   ra->new_cntbt.bload.nr_blocks;
		ASSERT(required < INT_MAX);

		/* If we've reserved enough blocks, we're done. */
		if (allocated >= required)
			break;

		desired = required - allocated;

		/* We need space but there's none left; bye! */
		if (ra->nr_real_records == 0) {
			error = -ENOSPC;
			break;
		}

		/* Grab the first record from the list. */
		error = xfarray_load(ra->free_records, record_nr, &arec);
		if (error)
			break;

		ASSERT(arec.ar_blockcount <= UINT_MAX);
		len = min_t(unsigned int, arec.ar_blockcount, desired);

		trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
				arec.ar_startblock, len, XFS_RMAP_OWN_AG);

		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
				arec.ar_startblock, len);
		if (error)
			break;
		allocated += len;
		ra->nr_blocks -= len;

		if (arec.ar_blockcount > desired) {
			/*
			 * Record has more space than we need.  The number of
			 * free records doesn't change, so shrink the free
			 * record, inform the caller that the records are no
			 * longer sorted by length, and exit.
			 */
			arec.ar_startblock += desired;
			arec.ar_blockcount -= desired;
			error = xfarray_store(ra->free_records, record_nr,
					&arec);
			if (error)
				break;

			*needs_resort = true;
			return 0;
		}

		/*
		 * We're going to use up the entire record, so unset it and
		 * move on to the next one.  This changes the number of free
		 * records (but doesn't break the sorting order), so we must
		 * go around the loop once more to re-run _bload_init.
		 */
		error = xfarray_unset(ra->free_records, record_nr);
		if (error)
			break;
		ra->nr_real_records--;
		record_nr--;
	} while (1);

	return error;
}

/*
 * Dispose of a single block reservation: add a deferred rmap for the part we
 * used for the new btrees, and put the unused remainder back in the free
 * space btree.
 */
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
				resv->used, XFS_RMAP_OWN_AG);

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, ra->new_bnobt.oinfo.oi_owner);

	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	return xrep_defer_finish(sc);
}

/*
 * Deal with all the space we reserved.  Blocks that were allocated for the
 * free space btrees need to have a (deferred) rmap added for the OWN_AG
 * allocation, and blocks that didn't get used can be freed via the usual
 * (deferred) means.
 */
STATIC void
xrep_abt_dispose_reservations(
	struct xrep_abt		*ra,
	int			error)
{
	struct xrep_newbt_resv	*resv, *n;

	if (error)
		goto junkit;

	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		error = xrep_abt_dispose_one(ra, resv);
		if (error)
			goto junkit;
	}

junkit:
	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		xfs_perag_put(resv->pag);
		list_del(&resv->list);
		kfree(resv);
	}

	xrep_newbt_cancel(&ra->new_bnobt);
	xrep_newbt_cancel(&ra->new_cntbt);
}

/* Retrieve free space data for bulk load.
 */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
	struct xrep_abt		*ra = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		/* Track the longest extent seen; used later for agf_longest. */
		ra->longest = max(ra->longest, arec->ar_blockcount);

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
}

/*
 * Reset the AGF counters to reflect the free space btrees that we just
 * rebuilt, then reinitialize the per-AG data.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		freesp_btreeblks = 0;

	/*
	 * Compute the contribution to agf_btreeblks for the new free space
	 * btrees.  This is the computed btree size minus anything we didn't
	 * use.
	 */
	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;

	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);

	/*
	 * The AGF header contains extra information related to the free space
	 * btrees, so we must update those fields here.
	 */
	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_longest = cpu_to_be32(ra->longest);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
						 XFS_AGF_LONGEST |
						 XFS_AGF_FREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the allocbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root
 * information logged to the repair transaction but not yet committed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free space
	 * btrees in as few extents as possible.  This reduces the amount of
	 * deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
			pag, XFS_BTNUM_BNO);
	cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
			pag, XFS_BTNUM_CNT);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so, so that we
	 * can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
			ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
			ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree.
*/ 774 ra->array_cur = XFARRAY_CURSOR_INIT; 775 ra->longest = 0; 776 error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); 777 if (error) 778 goto err_levels; 779 780 error = xrep_bnobt_sort_records(ra); 781 if (error) 782 return error; 783 784 /* Load the free space by block number tree. */ 785 ra->array_cur = XFARRAY_CURSOR_INIT; 786 error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); 787 if (error) 788 goto err_levels; 789 790 /* 791 * Install the new btrees in the AG header. After this point the old 792 * btrees are no longer accessible and the new trees are live. 793 */ 794 xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); 795 xfs_btree_del_cursor(bno_cur, 0); 796 xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); 797 xfs_btree_del_cursor(cnt_cur, 0); 798 799 /* Reset the AGF counters now that we've changed the btree shape. */ 800 error = xrep_abt_reset_counters(ra); 801 if (error) 802 goto err_newbt; 803 804 /* Dispose of any unused blocks and the accounting information. */ 805 xrep_abt_dispose_reservations(ra, error); 806 807 return xrep_roll_ag_trans(sc); 808 809 err_levels: 810 pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; 811 pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; 812 err_cur: 813 xfs_btree_del_cursor(cnt_cur, error); 814 xfs_btree_del_cursor(bno_cur, error); 815 err_newbt: 816 xrep_abt_dispose_reservations(ra, error); 817 return error; 818 } 819 820 /* 821 * Now that we've logged the roots of the new btrees, invalidate all of the 822 * old blocks and free them. 823 */ 824 STATIC int 825 xrep_abt_remove_old_trees( 826 struct xrep_abt *ra) 827 { 828 struct xfs_perag *pag = ra->sc->sa.pag; 829 int error; 830 831 /* Free the old btree blocks if they're not in use. 
*/ 832 error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, 833 &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); 834 if (error) 835 return error; 836 837 /* 838 * Now that we've zapped all the old allocbt blocks we can turn off 839 * the alternate height mechanism. 840 */ 841 pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; 842 pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; 843 return 0; 844 } 845 846 /* Repair the freespace btrees for some AG. */ 847 int 848 xrep_allocbt( 849 struct xfs_scrub *sc) 850 { 851 struct xrep_abt *ra; 852 struct xfs_mount *mp = sc->mp; 853 char *descr; 854 int error; 855 856 /* We require the rmapbt to rebuild anything. */ 857 if (!xfs_has_rmapbt(mp)) 858 return -EOPNOTSUPP; 859 860 ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); 861 if (!ra) 862 return -ENOMEM; 863 ra->sc = sc; 864 865 /* We rebuild both data structures. */ 866 sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; 867 868 /* 869 * Make sure the busy extent list is clear because we can't put extents 870 * on there twice. In theory we cleared this before we started, but 871 * let's not risk the filesystem. 872 */ 873 if (!xfs_extent_busy_list_empty(sc->sa.pag)) { 874 error = -EDEADLOCK; 875 goto out_ra; 876 } 877 878 /* Set up enough storage to handle maximally fragmented free space. */ 879 descr = xchk_xfile_ag_descr(sc, "free space records"); 880 error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, 881 sizeof(struct xfs_alloc_rec_incore), 882 &ra->free_records); 883 kfree(descr); 884 if (error) 885 goto out_ra; 886 887 /* Collect the free space data and find the old btree blocks. */ 888 xagb_bitmap_init(&ra->old_allocbt_blocks); 889 error = xrep_abt_find_freespace(ra); 890 if (error) 891 goto out_bitmap; 892 893 /* Rebuild the free space information. */ 894 error = xrep_abt_build_new_trees(ra); 895 if (error) 896 goto out_bitmap; 897 898 /* Kill the old trees. 
*/ 899 error = xrep_abt_remove_old_trees(ra); 900 if (error) 901 goto out_bitmap; 902 903 out_bitmap: 904 xagb_bitmap_destroy(&ra->old_allocbt_blocks); 905 xfarray_destroy(ra->free_records); 906 out_ra: 907 kfree(ra); 908 return error; 909 } 910 911 /* Make sure both btrees are ok after we've rebuilt them. */ 912 int 913 xrep_revalidate_allocbt( 914 struct xfs_scrub *sc) 915 { 916 __u32 old_type = sc->sm->sm_type; 917 int error; 918 919 /* 920 * We must update sm_type temporarily so that the tree-to-tree cross 921 * reference checks will work in the correct direction, and also so 922 * that tracing will report correctly if there are more errors. 923 */ 924 sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; 925 error = xchk_allocbt(sc); 926 if (error) 927 goto out; 928 929 sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; 930 error = xchk_allocbt(sc); 931 out: 932 sc->sm->sm_type = old_type; 933 return error; 934 } 935