// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us. However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In either case, format the new btree blocks almost completely full to
 * minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible. Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork	*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new metadata inode btree.
 * If the metadata file has a space reservation, the caller must adjust that
 * reservation when committing the new ondisk btree.
 */
int
xrep_newbt_init_metadir_inode(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	struct xfs_owner_info	oinfo;
	struct xfs_ifork	*ifp;

	ASSERT(xfs_is_metadir_inode(sc->ip));

	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	/*
	 * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the
	 * inode metadata space reservations can only account allocated space
	 * to the i_nblocks. We do not want to change the inode core fields
	 * until we're ready to commit the new tree, so we allocate the blocks
	 * as if they were regular file blocks. This exposes us to a higher
	 * risk of the repair being cancelled due to ENOSPC.
	 */
	xrep_newbt_init_ag(xnr, sc, &oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree. Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree. @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xrep_newbt_resv	*resv;
	int			error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool. Callers are required to
 * reap this reservation manually if the repair is cancelled. @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);
	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(xnr->resv != XFS_AG_RESV_METAFILE);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure. Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree. Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations. This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks. Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down. Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit. We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation. We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order. The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
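
/*
 * A minimal sketch of how a repair function typically wires the functions
 * above into the btree bulk loader; the xrep_foo_* names and the rf/priv
 * structure are hypothetical, and error handling is omitted.  See the real
 * callers under fs/xfs/scrub/ for the full pattern.
 *
 *	static int
 *	xrep_foo_claim_block(struct xfs_btree_cur *cur,
 *			union xfs_btree_ptr *ptr, void *priv)
 *	{
 *		struct xrep_foo	*rf = priv;
 *
 *		return xrep_newbt_claim_block(cur, &rf->new_btree, ptr);
 *	}
 *
 *	xrep_newbt_init_ag(&rf->new_btree, sc, &oinfo, hint, resv);
 *	rf->new_btree.bload.get_records = xrep_foo_get_records;
 *	rf->new_btree.bload.claim_block = xrep_foo_claim_block;
 *
 *	(with @cur being a staging btree cursor)
 *	error = xfs_btree_bload_compute_geometry(cur, &rf->new_btree.bload,
 *			nr_records);
 *	error = xrep_newbt_alloc_blocks(&rf->new_btree,
 *			rf->new_btree.bload.nr_blocks);
 *	error = xfs_btree_bload(cur, &rf->new_btree.bload, rf);
 *
 *	error = xrep_newbt_commit(&rf->new_btree);
 *	(or xrep_newbt_cancel(&rf->new_btree) if the repair is abandoned)
 */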

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}