1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_btree.h" 13 #include "xfs_btree_staging.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_sb.h" 17 #include "xfs_inode.h" 18 #include "xfs_alloc.h" 19 #include "xfs_rmap.h" 20 #include "xfs_ag.h" 21 #include "xfs_defer.h" 22 #include "scrub/scrub.h" 23 #include "scrub/common.h" 24 #include "scrub/trace.h" 25 #include "scrub/repair.h" 26 #include "scrub/newbt.h" 27 28 /* 29 * Estimate proper slack values for a btree that's being reloaded. 30 * 31 * Under most circumstances, we'll take whatever default loading value the 32 * btree bulk loading code calculates for us. However, there are some 33 * exceptions to this rule: 34 * 35 * (0) If someone turned one of the debug knobs. 36 * (1) If this is a per-AG btree and the AG has less than 10% space free. 37 * (2) If this is an inode btree and the FS has less than 10% space free. 38 39 * In either case, format the new btree blocks almost completely full to 40 * minimize space usage. 41 */ 42 static void 43 xrep_newbt_estimate_slack( 44 struct xrep_newbt *xnr) 45 { 46 struct xfs_scrub *sc = xnr->sc; 47 struct xfs_btree_bload *bload = &xnr->bload; 48 uint64_t free; 49 uint64_t sz; 50 51 /* 52 * The xfs_globals values are set to -1 (i.e. take the bload defaults) 53 * unless someone has set them otherwise, so we just pull the values 54 * here. 55 */ 56 bload->leaf_slack = xfs_globals.bload_leaf_slack; 57 bload->node_slack = xfs_globals.bload_node_slack; 58 59 if (sc->ops->type == ST_PERAG) { 60 free = sc->sa.pag->pagf_freeblks; 61 sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); 62 } else { 63 free = percpu_counter_sum(&sc->mp->m_fdblocks); 64 sz = sc->mp->m_sb.sb_dblocks; 65 } 66 67 /* No further changes if there's more than 10% free space left. */ 68 if (free >= div_u64(sz, 10)) 69 return; 70 71 /* 72 * We're low on space; load the btrees as tightly as possible. Leave 73 * a couple of open slots in each btree block so that we don't end up 74 * splitting the btrees like crazy after a mount. 75 */ 76 if (bload->leaf_slack < 0) 77 bload->leaf_slack = 2; 78 if (bload->node_slack < 0) 79 bload->node_slack = 2; 80 } 81 82 /* Initialize accounting resources for staging a new AG btree. */ 83 void 84 xrep_newbt_init_ag( 85 struct xrep_newbt *xnr, 86 struct xfs_scrub *sc, 87 const struct xfs_owner_info *oinfo, 88 xfs_fsblock_t alloc_hint, 89 enum xfs_ag_resv_type resv) 90 { 91 memset(xnr, 0, sizeof(struct xrep_newbt)); 92 xnr->sc = sc; 93 xnr->oinfo = *oinfo; /* structure copy */ 94 xnr->alloc_hint = alloc_hint; 95 xnr->resv = resv; 96 INIT_LIST_HEAD(&xnr->resv_list); 97 xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ 98 xrep_newbt_estimate_slack(xnr); 99 } 100 101 /* Initialize accounting resources for staging a new inode fork btree. */ 102 int 103 xrep_newbt_init_inode( 104 struct xrep_newbt *xnr, 105 struct xfs_scrub *sc, 106 int whichfork, 107 const struct xfs_owner_info *oinfo) 108 { 109 struct xfs_ifork *ifp; 110 111 ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); 112 if (!ifp) 113 return -ENOMEM; 114 115 xrep_newbt_init_ag(xnr, sc, oinfo, 116 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), 117 XFS_AG_RESV_NONE); 118 xnr->ifake.if_fork = ifp; 119 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); 120 return 0; 121 } 122 123 /* 124 * Initialize accounting resources for staging a new btree. Callers are 125 * expected to add their own reservations (and clean them up) manually. 126 */ 127 void 128 xrep_newbt_init_bare( 129 struct xrep_newbt *xnr, 130 struct xfs_scrub *sc) 131 { 132 xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, 133 XFS_AG_RESV_NONE); 134 } 135 136 /* 137 * Designate specific blocks to be used to build our new btree. @pag must be 138 * a passive reference. 139 */ 140 STATIC int 141 xrep_newbt_add_blocks( 142 struct xrep_newbt *xnr, 143 struct xfs_perag *pag, 144 const struct xfs_alloc_arg *args) 145 { 146 struct xfs_mount *mp = xnr->sc->mp; 147 struct xrep_newbt_resv *resv; 148 int error; 149 150 resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); 151 if (!resv) 152 return -ENOMEM; 153 154 INIT_LIST_HEAD(&resv->list); 155 resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 156 resv->len = args->len; 157 resv->used = 0; 158 resv->pag = xfs_perag_hold(pag); 159 160 if (args->tp) { 161 ASSERT(xnr->oinfo.oi_offset == 0); 162 163 error = xfs_alloc_schedule_autoreap(args, 164 XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap); 165 if (error) 166 goto out_pag; 167 } 168 169 list_add_tail(&resv->list, &xnr->resv_list); 170 return 0; 171 out_pag: 172 xfs_perag_put(resv->pag); 173 kfree(resv); 174 return error; 175 } 176 177 /* 178 * Add an extent to the new btree reservation pool. Callers are required to 179 * reap this reservation manually if the repair is cancelled. @pag must be a 180 * passive reference. 181 */ 182 int 183 xrep_newbt_add_extent( 184 struct xrep_newbt *xnr, 185 struct xfs_perag *pag, 186 xfs_agblock_t agbno, 187 xfs_extlen_t len) 188 { 189 struct xfs_mount *mp = xnr->sc->mp; 190 struct xfs_alloc_arg args = { 191 .tp = NULL, /* no autoreap */ 192 .oinfo = xnr->oinfo, 193 .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), 194 .len = len, 195 .resv = xnr->resv, 196 }; 197 198 return xrep_newbt_add_blocks(xnr, pag, &args); 199 } 200 201 /* Don't let our allocation hint take us beyond this AG */ 202 static inline void 203 xrep_newbt_validate_ag_alloc_hint( 204 struct xrep_newbt *xnr) 205 { 206 struct xfs_scrub *sc = xnr->sc; 207 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); 208 209 if (agno == sc->sa.pag->pag_agno && 210 xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) 211 return; 212 213 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, 214 XFS_AGFL_BLOCK(sc->mp) + 1); 215 } 216 217 /* Allocate disk space for a new per-AG btree. */ 218 STATIC int 219 xrep_newbt_alloc_ag_blocks( 220 struct xrep_newbt *xnr, 221 uint64_t nr_blocks) 222 { 223 struct xfs_scrub *sc = xnr->sc; 224 struct xfs_mount *mp = sc->mp; 225 int error = 0; 226 227 ASSERT(sc->sa.pag != NULL); 228 229 while (nr_blocks > 0) { 230 struct xfs_alloc_arg args = { 231 .tp = sc->tp, 232 .mp = mp, 233 .oinfo = xnr->oinfo, 234 .minlen = 1, 235 .maxlen = nr_blocks, 236 .prod = 1, 237 .resv = xnr->resv, 238 }; 239 xfs_agnumber_t agno; 240 241 xrep_newbt_validate_ag_alloc_hint(xnr); 242 243 if (xnr->alloc_vextent) 244 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); 245 else 246 error = xfs_alloc_vextent_near_bno(&args, 247 xnr->alloc_hint); 248 if (error) 249 return error; 250 if (args.fsbno == NULLFSBLOCK) 251 return -ENOSPC; 252 253 agno = XFS_FSB_TO_AGNO(mp, args.fsbno); 254 255 trace_xrep_newbt_alloc_ag_blocks(mp, agno, 256 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, 257 xnr->oinfo.oi_owner); 258 259 if (agno != sc->sa.pag->pag_agno) { 260 ASSERT(agno == sc->sa.pag->pag_agno); 261 return -EFSCORRUPTED; 262 } 263 264 error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); 265 if (error) 266 return error; 267 268 nr_blocks -= args.len; 269 xnr->alloc_hint = args.fsbno + args.len; 270 271 error = xrep_defer_finish(sc); 272 if (error) 273 return error; 274 } 275 276 return 0; 277 } 278 279 /* Don't let our allocation hint take us beyond EOFS */ 280 static inline void 281 xrep_newbt_validate_file_alloc_hint( 282 struct xrep_newbt *xnr) 283 { 284 struct xfs_scrub *sc = xnr->sc; 285 286 if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) 287 return; 288 289 xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); 290 } 291 292 /* Allocate disk space for our new file-based btree. */ 293 STATIC int 294 xrep_newbt_alloc_file_blocks( 295 struct xrep_newbt *xnr, 296 uint64_t nr_blocks) 297 { 298 struct xfs_scrub *sc = xnr->sc; 299 struct xfs_mount *mp = sc->mp; 300 int error = 0; 301 302 while (nr_blocks > 0) { 303 struct xfs_alloc_arg args = { 304 .tp = sc->tp, 305 .mp = mp, 306 .oinfo = xnr->oinfo, 307 .minlen = 1, 308 .maxlen = nr_blocks, 309 .prod = 1, 310 .resv = xnr->resv, 311 }; 312 struct xfs_perag *pag; 313 xfs_agnumber_t agno; 314 315 xrep_newbt_validate_file_alloc_hint(xnr); 316 317 if (xnr->alloc_vextent) 318 error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); 319 else 320 error = xfs_alloc_vextent_start_ag(&args, 321 xnr->alloc_hint); 322 if (error) 323 return error; 324 if (args.fsbno == NULLFSBLOCK) 325 return -ENOSPC; 326 327 agno = XFS_FSB_TO_AGNO(mp, args.fsbno); 328 329 trace_xrep_newbt_alloc_file_blocks(mp, agno, 330 XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, 331 xnr->oinfo.oi_owner); 332 333 pag = xfs_perag_get(mp, agno); 334 if (!pag) { 335 ASSERT(0); 336 return -EFSCORRUPTED; 337 } 338 339 error = xrep_newbt_add_blocks(xnr, pag, &args); 340 xfs_perag_put(pag); 341 if (error) 342 return error; 343 344 nr_blocks -= args.len; 345 xnr->alloc_hint = args.fsbno + args.len; 346 347 error = xrep_defer_finish(sc); 348 if (error) 349 return error; 350 } 351 352 return 0; 353 } 354 355 /* Allocate disk space for our new btree. */ 356 int 357 xrep_newbt_alloc_blocks( 358 struct xrep_newbt *xnr, 359 uint64_t nr_blocks) 360 { 361 if (xnr->sc->ip) 362 return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); 363 return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); 364 } 365 366 /* 367 * Free the unused part of a space extent that was reserved for a new ondisk 368 * structure. Returns the number of EFIs logged or a negative errno. 369 */ 370 STATIC int 371 xrep_newbt_free_extent( 372 struct xrep_newbt *xnr, 373 struct xrep_newbt_resv *resv, 374 bool btree_committed) 375 { 376 struct xfs_scrub *sc = xnr->sc; 377 xfs_agblock_t free_agbno = resv->agbno; 378 xfs_extlen_t free_aglen = resv->len; 379 xfs_fsblock_t fsbno; 380 int error; 381 382 if (!btree_committed || resv->used == 0) { 383 /* 384 * If we're not committing a new btree or we didn't use the 385 * space reservation, let the existing EFI free the entire 386 * space extent. 387 */ 388 trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, 389 free_agbno, free_aglen, xnr->oinfo.oi_owner); 390 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); 391 return 1; 392 } 393 394 /* 395 * We used space and committed the btree. Cancel the autoreap, remove 396 * the written blocks from the reservation, and possibly log a new EFI 397 * to free any unused reservation space. 398 */ 399 xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); 400 free_agbno += resv->used; 401 free_aglen -= resv->used; 402 403 if (free_aglen == 0) 404 return 0; 405 406 trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, 407 free_aglen, xnr->oinfo.oi_owner); 408 409 ASSERT(xnr->resv != XFS_AG_RESV_AGFL); 410 ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); 411 412 /* 413 * Use EFIs to free the reservations. This reduces the chance 414 * that we leak blocks if the system goes down. 415 */ 416 fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); 417 error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, 418 xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); 419 if (error) 420 return error; 421 422 return 1; 423 } 424 425 /* Free all the accounting info and disk space we reserved for a new btree. */ 426 STATIC int 427 xrep_newbt_free( 428 struct xrep_newbt *xnr, 429 bool btree_committed) 430 { 431 struct xfs_scrub *sc = xnr->sc; 432 struct xrep_newbt_resv *resv, *n; 433 unsigned int freed = 0; 434 int error = 0; 435 436 /* 437 * If the filesystem already went down, we can't free the blocks. Skip 438 * ahead to freeing the incore metadata because we can't fix anything. 439 */ 440 if (xfs_is_shutdown(sc->mp)) 441 goto junkit; 442 443 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { 444 int ret; 445 446 ret = xrep_newbt_free_extent(xnr, resv, btree_committed); 447 list_del(&resv->list); 448 xfs_perag_put(resv->pag); 449 kfree(resv); 450 if (ret < 0) { 451 error = ret; 452 goto junkit; 453 } 454 455 freed += ret; 456 if (freed >= XREP_MAX_ITRUNCATE_EFIS) { 457 error = xrep_defer_finish(sc); 458 if (error) 459 goto junkit; 460 freed = 0; 461 } 462 } 463 464 if (freed) 465 error = xrep_defer_finish(sc); 466 467 junkit: 468 /* 469 * If we still have reservations attached to @newbt, cleanup must have 470 * failed and the filesystem is about to go down. Clean up the incore 471 * reservations and try to commit to freeing the space we used. 472 */ 473 list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { 474 xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); 475 list_del(&resv->list); 476 xfs_perag_put(resv->pag); 477 kfree(resv); 478 } 479 480 if (sc->ip) { 481 kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); 482 xnr->ifake.if_fork = NULL; 483 } 484 485 return error; 486 } 487 488 /* 489 * Free all the accounting info and unused disk space allocations after 490 * committing a new btree. 491 */ 492 int 493 xrep_newbt_commit( 494 struct xrep_newbt *xnr) 495 { 496 return xrep_newbt_free(xnr, true); 497 } 498 499 /* 500 * Free all the accounting info and all of the disk space we reserved for a new 501 * btree that we're not going to commit. We want to try to roll things back 502 * cleanly for things like ENOSPC midway through allocation. 503 */ 504 void 505 xrep_newbt_cancel( 506 struct xrep_newbt *xnr) 507 { 508 xrep_newbt_free(xnr, false); 509 } 510 511 /* Feed one of the reserved btree blocks to the bulk loader. */ 512 int 513 xrep_newbt_claim_block( 514 struct xfs_btree_cur *cur, 515 struct xrep_newbt *xnr, 516 union xfs_btree_ptr *ptr) 517 { 518 struct xrep_newbt_resv *resv; 519 struct xfs_mount *mp = cur->bc_mp; 520 xfs_agblock_t agbno; 521 522 /* 523 * The first item in the list should always have a free block unless 524 * we're completely out. 525 */ 526 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); 527 if (resv->used == resv->len) 528 return -ENOSPC; 529 530 /* 531 * Peel off a block from the start of the reservation. We allocate 532 * blocks in order to place blocks on disk in increasing record or key 533 * order. The block reservations tend to end up on the list in 534 * decreasing order, which hopefully results in leaf blocks ending up 535 * together. 536 */ 537 agbno = resv->agbno + resv->used; 538 resv->used++; 539 540 /* If we used all the blocks in this reservation, move it to the end. */ 541 if (resv->used == resv->len) 542 list_move_tail(&resv->list, &xnr->resv_list); 543 544 trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, 545 xnr->oinfo.oi_owner); 546 547 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 548 ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, 549 agbno)); 550 else 551 ptr->s = cpu_to_be32(agbno); 552 553 /* Relog all the EFIs. */ 554 return xrep_defer_finish(xnr->sc); 555 } 556 557 /* How many reserved blocks are unused? */ 558 unsigned int 559 xrep_newbt_unused_blocks( 560 struct xrep_newbt *xnr) 561 { 562 struct xrep_newbt_resv *resv; 563 unsigned int unused = 0; 564 565 list_for_each_entry(resv, &xnr->resv_list, list) 566 unused += resv->len - resv->used; 567 return unused; 568 } 569