1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_btree.h" 13 #include "xfs_btree_staging.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_sb.h" 17 #include "xfs_inode.h" 18 #include "xfs_alloc.h" 19 #include "xfs_rmap.h" 20 #include "xfs_ag.h" 21 #include "xfs_defer.h" 22 #include "xfs_metafile.h" 23 #include "xfs_quota.h" 24 #include "scrub/scrub.h" 25 #include "scrub/common.h" 26 #include "scrub/trace.h" 27 #include "scrub/repair.h" 28 #include "scrub/newbt.h" 29 30 /* 31 * This is the maximum number of deferred extent freeing item extents (EFIs) 32 * that we'll attach to a transaction without rolling the transaction to avoid 33 * overrunning a tr_itruncate reservation. The newbt code should reserve 34 * exactly the correct number of blocks to rebuild the btree, so there should 35 * not be any excess blocks to free when committing a new btree. 36 */ 37 #define XREP_MAX_ITRUNCATE_EFIS (128) 38 39 /* 40 * Estimate proper slack values for a btree that's being reloaded. 41 * 42 * Under most circumstances, we'll take whatever default loading value the 43 * btree bulk loading code calculates for us. However, there are some 44 * exceptions to this rule: 45 * 46 * (0) If someone turned one of the debug knobs. 47 * (1) If this is a per-AG btree and the AG has less than 10% space free. 48 * (2) If this is an inode btree and the FS has less than 10% space free. 49 50 * In either case, format the new btree blocks almost completely full to 51 * minimize space usage. 52 */ 53 static void 54 xrep_newbt_estimate_slack( 55 struct xrep_newbt *xnr) 56 { 57 struct xfs_scrub *sc = xnr->sc; 58 struct xfs_btree_bload *bload = &xnr->bload; 59 uint64_t free; 60 uint64_t sz; 61 62 /* 63 * The xfs_globals values are set to -1 (i.e. take the bload defaults) 64 * unless someone has set them otherwise, so we just pull the values 65 * here. 66 */ 67 bload->leaf_slack = xfs_globals.bload_leaf_slack; 68 bload->node_slack = xfs_globals.bload_node_slack; 69 70 if (sc->ops->type == ST_PERAG) { 71 free = sc->sa.pag->pagf_freeblks; 72 sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); 73 } else { 74 free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); 75 sz = sc->mp->m_sb.sb_dblocks; 76 } 77 78 /* No further changes if there's more than 10% free space left. */ 79 if (free >= div_u64(sz, 10)) 80 return; 81 82 /* 83 * We're low on space; load the btrees as tightly as possible. Leave 84 * a couple of open slots in each btree block so that we don't end up 85 * splitting the btrees like crazy after a mount. 86 */ 87 if (bload->leaf_slack < 0) 88 bload->leaf_slack = 2; 89 if (bload->node_slack < 0) 90 bload->node_slack = 2; 91 } 92 93 /* Initialize accounting resources for staging a new AG btree. */ 94 void 95 xrep_newbt_init_ag( 96 struct xrep_newbt *xnr, 97 struct xfs_scrub *sc, 98 const struct xfs_owner_info *oinfo, 99 xfs_fsblock_t alloc_hint, 100 enum xfs_ag_resv_type resv) 101 { 102 memset(xnr, 0, sizeof(struct xrep_newbt)); 103 xnr->sc = sc; 104 xnr->oinfo = *oinfo; /* structure copy */ 105 xnr->alloc_hint = alloc_hint; 106 xnr->resv = resv; 107 INIT_LIST_HEAD(&xnr->resv_list); 108 xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ 109 xrep_newbt_estimate_slack(xnr); 110 } 111 112 /* Initialize accounting resources for staging a new inode fork btree. */ 113 int 114 xrep_newbt_init_inode( 115 struct xrep_newbt *xnr, 116 struct xfs_scrub *sc, 117 int whichfork, 118 const struct xfs_owner_info *oinfo) 119 { 120 struct xfs_ifork *ifp; 121 122 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); 123 if (!ifp) 124 return -ENOMEM; 125 126 xrep_newbt_init_ag(xnr, sc, oinfo, XFS_INODE_TO_FSB(sc->ip), 127 XFS_AG_RESV_NONE); 128 xnr->ifake.if_fork = ifp; 129 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); 130 return 0; 131 } 132 133 /* 134 * Initialize accounting resources for staging a new metadata inode btree. 135 * If the metadata file has a space reservation, the caller must adjust that 136 * reservation when committing the new ondisk btree. 137 */ 138 int 139 xrep_newbt_init_metadir_inode( 140 struct xrep_newbt *xnr, 141 struct xfs_scrub *sc) 142 { 143 struct xfs_owner_info oinfo; 144 struct xfs_ifork *ifp; 145 146 ASSERT(xfs_is_metadir_inode(sc->ip)); 147 148 xfs_rmap_inode_bmbt_owner(&oinfo, sc->ip, XFS_DATA_FORK); 149 150 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); 151 if (!ifp) 152 return -ENOMEM; 153 154 /* 155 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the 156 * inode metadata space reservations can only account allocated space 157 * to the i_nblocks. We do not want to change the inode core fields 158 * until we're ready to commit the new tree, so we allocate the blocks 159 * as if they were regular file blocks. This exposes us to a higher 160 * risk of the repair being cancelled due to ENOSPC. 161 */ 162 xrep_newbt_init_ag(xnr, sc, &oinfo, XFS_INODE_TO_FSB(sc->ip), 163 XFS_AG_RESV_NONE); 164 xnr->ifake.if_fork = ifp; 165 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK); 166 return 0; 167 } 168 169 /* 170 * Initialize accounting resources for staging a new btree. Callers are 171 * expected to add their own reservations (and clean them up) manually. 172 */ 173 void 174 xrep_newbt_init_bare( 175 struct xrep_newbt *xnr, 176 struct xfs_scrub *sc) 177 { 178 xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, 179 XFS_AG_RESV_NONE); 180 } 181 182 /* 183 * Designate specific blocks to be used to build our new btree. @pag must be 184 * a passive reference. 185 */ 186 STATIC int 187 xrep_newbt_add_blocks( 188 struct xrep_newbt *xnr, 189 struct xfs_perag *pag, 190 const struct xfs_alloc_arg *args) 191 { 192 struct xfs_mount *mp = xnr->sc->mp; 193 struct xrep_newbt_resv *resv; 194 int error; 195 196 resv = kmalloc_obj(struct xrep_newbt_resv, XCHK_GFP_FLAGS); 197 if (!resv) 198 return -ENOMEM; 199 200 INIT_LIST_HEAD(&resv->list); 201 resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 202 resv->len = args->len; 203 resv->used = 0; 204 resv->pag = xfs_perag_hold(pag); 205 206 if (args->tp) { 207 ASSERT(xnr->oinfo.oi_offset == 0); 208 209 error = xfs_alloc_schedule_autoreap(args, 210 XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap); 211 if (error) 212 goto out_pag; 213 } 214 215 list_add_tail(&resv->list, &xnr->resv_list); 216 return 0; 217 out_pag: 218 xfs_perag_put(resv->pag); 219 kfree(resv); 220 return error; 221 } 222 223 /* 224 * Add an extent to the new btree reservation pool. Callers are required to 225 * reap this reservation manually if the repair is cancelled. @pag must be a 226 * passive reference. 227 */ 228 int 229 xrep_newbt_add_extent( 230 struct xrep_newbt *xnr, 231 struct xfs_perag *pag, 232 xfs_agblock_t agbno, 233 xfs_extlen_t len) 234 { 235 struct xfs_alloc_arg args = { 236 .tp = NULL, /* no autoreap */ 237 .oinfo = xnr->oinfo, 238 .fsbno = xfs_agbno_to_fsb(pag, agbno), 239 .len = len, 240 .resv = xnr->resv, 241 }; 242 243 return xrep_newbt_add_blocks(xnr, pag, &args); 244 } 245 246 /* Don't let our allocation hint take us beyond this AG */ 247 static inline void 248 xrep_newbt_validate_ag_alloc_hint( 249 struct xrep_newbt *xnr) 250 { 251 struct xfs_scrub *sc = xnr->sc; 252 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); 253 254 if (agno == pag_agno(sc->sa.pag) && 255 xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) 256 return; 257 258 xnr->alloc_hint = 259 xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1); 260 } 261 262 /* Allocate disk space for a new per-AG btree. */ 263 STATIC int 264 xrep_newbt_alloc_ag_blocks( 265 struct xrep_newbt *xnr, 266 uint64_t nr_blocks) 267 { 268 struct xfs_scrub *sc = xnr->sc; 269 struct xfs_mount *mp = sc->mp; 270 int error = 0; 271 272 ASSERT(sc->sa.pag != NULL); 273 ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); 274 275 while (nr_blocks > 0) { 276 struct xfs_alloc_arg args = { 277 .tp = sc->tp, 278 .mp = mp, 279 .oinfo = xnr->oinfo, 280 .minlen = 1, 281 .maxlen = nr_blocks, 282 .prod = 1, 283 .resv = xnr->resv, 284 }; 285 xfs_agnumber_t agno; 286 287 xrep_newbt_validate_ag_alloc_hint(xnr); 288 289 if (xnr->alloc_vextent) 290 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); 291 else 292 error = xfs_alloc_vextent_near_bno(&args, 293 xnr->alloc_hint); 294 if (error) 295 return error; 296 if (args.fsbno == NULLFSBLOCK) 297 return -ENOSPC; 298 299 agno = XFS_FSB_TO_AGNO(mp, args.fsbno); 300 if (agno != pag_agno(sc->sa.pag)) { 301 ASSERT(agno == pag_agno(sc->sa.pag)); 302 return -EFSCORRUPTED; 303 } 304 305 trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, 306 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, 307 xnr->oinfo.oi_owner); 308 309 error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); 310 if (error) 311 return error; 312 313 nr_blocks -= args.len; 314 xnr->alloc_hint = args.fsbno + args.len; 315 316 error = xrep_defer_finish(sc); 317 if (error) 318 return error; 319 } 320 321 return 0; 322 } 323 324 /* Don't let our allocation hint take us beyond EOFS */ 325 static inline void 326 xrep_newbt_validate_file_alloc_hint( 327 struct xrep_newbt *xnr) 328 { 329 struct xfs_scrub *sc = xnr->sc; 330 331 if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) 332 return; 333 334 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); 335 } 336 337 /* Allocate disk space for our new file-based btree. */ 338 STATIC int 339 xrep_newbt_alloc_file_blocks( 340 struct xrep_newbt *xnr, 341 uint64_t nr_blocks) 342 { 343 struct xfs_scrub *sc = xnr->sc; 344 struct xfs_mount *mp = sc->mp; 345 int error = 0; 346 347 ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); 348 349 while (nr_blocks > 0) { 350 struct xfs_alloc_arg args = { 351 .tp = sc->tp, 352 .mp = mp, 353 .oinfo = xnr->oinfo, 354 .minlen = 1, 355 .maxlen = nr_blocks, 356 .prod = 1, 357 .resv = xnr->resv, 358 }; 359 struct xfs_perag *pag; 360 xfs_agnumber_t agno; 361 362 xrep_newbt_validate_file_alloc_hint(xnr); 363 364 if (xnr->alloc_vextent) 365 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); 366 else 367 error = xfs_alloc_vextent_start_ag(&args, 368 xnr->alloc_hint); 369 if (error) 370 return error; 371 if (args.fsbno == NULLFSBLOCK) 372 return -ENOSPC; 373 374 agno = XFS_FSB_TO_AGNO(mp, args.fsbno); 375 376 pag = xfs_perag_get(mp, agno); 377 if (!pag) { 378 ASSERT(0); 379 return -EFSCORRUPTED; 380 } 381 382 trace_xrep_newbt_alloc_file_blocks(pag, 383 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, 384 xnr->oinfo.oi_owner); 385 386 error = xrep_newbt_add_blocks(xnr, pag, &args); 387 xfs_perag_put(pag); 388 if (error) 389 return error; 390 391 nr_blocks -= args.len; 392 xnr->alloc_hint = args.fsbno + args.len; 393 394 error = xrep_defer_finish(sc); 395 if (error) 396 return error; 397 } 398 399 return 0; 400 } 401 402 /* Allocate disk space for our new btree. */ 403 int 404 xrep_newbt_alloc_blocks( 405 struct xrep_newbt *xnr, 406 uint64_t nr_blocks) 407 { 408 if (xnr->sc->ip) 409 return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); 410 return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); 411 } 412 413 /* 414 * Free the unused part of a space extent that was reserved for a new ondisk 415 * structure. Returns the number of EFIs logged or a negative errno. 416 */ 417 STATIC int 418 xrep_newbt_free_extent( 419 struct xrep_newbt *xnr, 420 struct xrep_newbt_resv *resv, 421 bool btree_committed) 422 { 423 struct xfs_scrub *sc = xnr->sc; 424 xfs_agblock_t free_agbno = resv->agbno; 425 xfs_extlen_t free_aglen = resv->len; 426 int error; 427 428 if (!btree_committed || resv->used == 0) { 429 /* 430 * If we're not committing a new btree or we didn't use the 431 * space reservation, let the existing EFI free the entire 432 * space extent. 433 */ 434 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, 435 xnr->oinfo.oi_owner); 436 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); 437 return 1; 438 } 439 440 /* 441 * We used space and committed the btree. Cancel the autoreap, remove 442 * the written blocks from the reservation, and possibly log a new EFI 443 * to free any unused reservation space. 444 */ 445 xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); 446 free_agbno += resv->used; 447 free_aglen -= resv->used; 448 449 if (free_aglen == 0) 450 return 0; 451 452 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, 453 xnr->oinfo.oi_owner); 454 455 ASSERT(xnr->resv != XFS_AG_RESV_AGFL); 456 ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); 457 458 /* 459 * Use EFIs to free the reservations. This reduces the chance 460 * that we leak blocks if the system goes down. 461 */ 462 error = xfs_free_extent_later(sc->tp, 463 xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen, 464 &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); 465 if (error) 466 return error; 467 468 return 1; 469 } 470 471 /* Free all the accounting info and disk space we reserved for a new btree. */ 472 STATIC int 473 xrep_newbt_free( 474 struct xrep_newbt *xnr, 475 bool btree_committed) 476 { 477 struct xfs_scrub *sc = xnr->sc; 478 struct xrep_newbt_resv *resv, *n; 479 unsigned int freed = 0; 480 int error = 0; 481 482 /* 483 * If the filesystem already went down, we can't free the blocks. Skip 484 * ahead to freeing the incore metadata because we can't fix anything. 485 */ 486 if (xfs_is_shutdown(sc->mp)) 487 goto junkit; 488 489 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { 490 int ret; 491 492 ret = xrep_newbt_free_extent(xnr, resv, btree_committed); 493 list_del(&resv->list); 494 xfs_perag_put(resv->pag); 495 kfree(resv); 496 if (ret < 0) { 497 error = ret; 498 goto junkit; 499 } 500 501 freed += ret; 502 if (freed >= XREP_MAX_ITRUNCATE_EFIS) { 503 error = xrep_defer_finish(sc); 504 if (error) 505 goto junkit; 506 freed = 0; 507 } 508 } 509 510 if (freed) 511 error = xrep_defer_finish(sc); 512 513 junkit: 514 /* 515 * If we still have reservations attached to @newbt, cleanup must have 516 * failed and the filesystem is about to go down. Clean up the incore 517 * reservations and try to commit to freeing the space we used. 518 */ 519 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { 520 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); 521 list_del(&resv->list); 522 xfs_perag_put(resv->pag); 523 kfree(resv); 524 } 525 526 if (sc->ip) { 527 kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); 528 xnr->ifake.if_fork = NULL; 529 } 530 531 return error; 532 } 533 534 /* 535 * Free all the accounting info and unused disk space allocations after 536 * committing a new btree. 537 */ 538 int 539 xrep_newbt_commit( 540 struct xrep_newbt *xnr) 541 { 542 return xrep_newbt_free(xnr, true); 543 } 544 545 /* 546 * Free all the accounting info and all of the disk space we reserved for a new 547 * btree that we're not going to commit. We want to try to roll things back 548 * cleanly for things like ENOSPC midway through allocation. 549 */ 550 void 551 xrep_newbt_cancel( 552 struct xrep_newbt *xnr) 553 { 554 xrep_newbt_free(xnr, false); 555 } 556 557 /* Feed one of the reserved btree blocks to the bulk loader. */ 558 int 559 xrep_newbt_claim_block( 560 struct xfs_btree_cur *cur, 561 struct xrep_newbt *xnr, 562 union xfs_btree_ptr *ptr) 563 { 564 struct xrep_newbt_resv *resv; 565 xfs_agblock_t agbno; 566 567 /* 568 * The first item in the list should always have a free block unless 569 * we're completely out. 570 */ 571 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); 572 if (resv->used == resv->len) 573 return -ENOSPC; 574 575 /* 576 * Peel off a block from the start of the reservation. We allocate 577 * blocks in order to place blocks on disk in increasing record or key 578 * order. The block reservations tend to end up on the list in 579 * decreasing order, which hopefully results in leaf blocks ending up 580 * together. 581 */ 582 agbno = resv->agbno + resv->used; 583 resv->used++; 584 585 /* If we used all the blocks in this reservation, move it to the end. */ 586 if (resv->used == resv->len) 587 list_move_tail(&resv->list, &xnr->resv_list); 588 589 trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner); 590 591 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 592 ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno)); 593 else 594 ptr->s = cpu_to_be32(agbno); 595 596 /* Relog all the EFIs. */ 597 return xrep_defer_finish(xnr->sc); 598 } 599 600 /* How many reserved blocks are unused? */ 601 unsigned int 602 xrep_newbt_unused_blocks( 603 struct xrep_newbt *xnr) 604 { 605 struct xrep_newbt_resv *resv; 606 unsigned int unused = 0; 607 608 list_for_each_entry(resv, &xnr->resv_list, list) 609 unused += resv->len - resv->used; 610 return unused; 611 } 612