// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * This is the maximum number of deferred extent freeing item extents (EFIs)
 * that we'll attach to a transaction without rolling the transaction to avoid
 * overrunning a tr_itruncate reservation. The newbt code should reserve
 * exactly the correct number of blocks to rebuild the btree, so there should
 * not be any excess blocks to free when committing a new btree.
 */
#define XREP_MAX_ITRUNCATE_EFIS	(128)

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us. However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible. Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
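
/*
 * Rough sketch of how the pieces in this file fit together during a repair.
 * This is illustrative only: the xrep_newbt_* calls are the functions below
 * (declared in scrub/newbt.h) and the xfs_btree_bload* calls come from
 * xfs_btree_staging.h, but the surrounding rebuild function, its staging
 * cursor @cur, @oinfo, @alloc_hint, @nr_records, @priv, and the error labels
 * are hypothetical.
 *
 *	struct xrep_newbt	xnr;
 *	int			error;
 *
 *	xrep_newbt_init_ag(&xnr, sc, &oinfo, alloc_hint, XFS_AG_RESV_NONE);
 *
 *	// size the new btree from the record count
 *	error = xfs_btree_bload_compute_geometry(cur, &xnr.bload, nr_records);
 *	if (error)
 *		goto err_cancel;
 *
 *	// reserve exactly the number of blocks the bulk loader computed
 *	error = xrep_newbt_alloc_blocks(&xnr, xnr.bload.nr_blocks);
 *	if (error)
 *		goto err_cancel;
 *
 *	// bulk-load records; blocks are claimed via xrep_newbt_claim_block()
 *	error = xfs_btree_bload(cur, &xnr.bload, priv);
 *	if (error)
 *		goto err_cancel;
 *
 *	// ... commit the new root, then release the unused reservations ...
 *	return xrep_newbt_commit(&xnr);
 * err_cancel:
 *	xrep_newbt_cancel(&xnr);
 *	return error;
 */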

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new metadata inode btree.
 * If the metadata file has a space reservation, the caller must adjust that
 * reservation when committing the new ondisk btree.
 */
int
xrep_newbt_init_metadir_inode(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	struct xfs_owner_info	oinfo;
	struct xfs_ifork	*ifp;

	ASSERT(xfs_is_metadir_inode(sc->ip));

	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	/*
	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
	 * inode metadata space reservations can only account allocated space
	 * to the i_nblocks. We do not want to change the inode core fields
	 * until we're ready to commit the new tree, so we allocate the blocks
	 * as if they were regular file blocks. This exposes us to a higher
	 * risk of the repair being cancelled due to ENOSPC.
	 */
	xrep_newbt_init_ag(xnr, sc, &oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree. Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree. @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool. Callers are required to
 * reap this reservation manually if the repair is cancelled. @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);
	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
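
/*
 * Example (illustrative only): a repair that must place the new blocks in a
 * particular way can override the default near-bno/start-ag allocators above
 * by pointing xnr->alloc_vextent at its own helper before calling
 * xrep_newbt_alloc_blocks(). The helper name below is hypothetical; its
 * signature matches the xnr->alloc_vextent calls above, and
 * xfs_alloc_vextent_exact_bno() is the regular exact-placement allocator.
 *
 *	static int
 *	xrep_example_alloc_vextent(
 *		struct xfs_scrub	*sc,
 *		struct xfs_alloc_arg	*args,
 *		xfs_fsblock_t		alloc_hint)
 *	{
 *		return xfs_alloc_vextent_exact_bno(args, alloc_hint);
 *	}
 *
 *	xnr.alloc_vextent = xrep_example_alloc_vextent;
 */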

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure. Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree. Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations. This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks. Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down. Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit. We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation. We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order. The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}
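
/*
 * Example (illustrative only): the bulk loader reaches the reserved blocks
 * through a thin ->claim_block callback that a repair points at
 * xrep_newbt_claim_block(). The xrep_example_* names and the priv layout are
 * hypothetical; the callback signature is the one expected by
 * struct xfs_btree_bload in xfs_btree_staging.h.
 *
 *	static int
 *	xrep_example_claim_block(
 *		struct xfs_btree_cur	*cur,
 *		union xfs_btree_ptr	*ptr,
 *		void			*priv)
 *	{
 *		struct xrep_example	*rx = priv;
 *
 *		return xrep_newbt_claim_block(cur, &rx->new_btree, ptr);
 *	}
 *
 *	...
 *	rx->new_btree.bload.claim_block = xrep_example_claim_block;
 *	rx->new_btree.bload.get_records = xrep_example_get_records;
 */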