// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In the latter two cases, format the new btree blocks almost completely
 * full to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

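/*
 * Rough sketch of the expected call sequence, showing how the helpers in
 * this file fit together.  The variable names and error labels below are
 * purely illustrative and do not appear elsewhere in this file:
 *
 *	struct xrep_newbt	xnr;
 *
 *	xrep_newbt_init_ag(&xnr, sc, &oinfo, alloc_hint, resv);
 *	error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
 *	if (error)
 *		goto out_cancel;
 *	(stage and bulk load the new btree; the loader pulls blocks out of
 *	 the reservation pool via xrep_newbt_claim_block())
 *	error = xrep_newbt_commit(&xnr);
 *	...
 * out_cancel:
 *	xrep_newbt_cancel(&xnr);
 */
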
/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= xfs_agbno_to_fsb(pag, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == pag_agno(sc->sa.pag) &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint =
		xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}

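/*
 * A note on the fallback hint above: XFS_AGFL_BLOCK() is the last of the
 * statically placed AG header blocks (superblock, AGF, AGI, AGFL), so
 * XFS_AGFL_BLOCK() + 1 is the first block in the AG that could possibly be
 * free space.  If the caller's hint points outside this AG or fails
 * verification, resetting it there restarts the search at the beginning of
 * the AG's allocatable space.
 */
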
/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
		if (agno != pag_agno(sc->sa.pag)) {
			ASSERT(agno == pag_agno(sc->sa.pag));
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		trace_xrep_newbt_alloc_file_blocks(pag,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

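/*
 * Reservation teardown.  Every extent that the allocation paths above handed
 * to xrep_newbt_add_blocks() is covered by an automatic-reap extent free
 * intent scheduled in that function, so if the repair never reaches the
 * commit step, the reserved blocks end up freed again.  The functions below
 * decide, per reservation, whether to let that intent stand (repair
 * cancelled or extent unused) or to cancel it and log a smaller free intent
 * covering only the unwritten tail of the reservation.
 */
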
/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
				xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
			xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	error = xfs_free_extent_later(sc->tp,
			xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
			&xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.
	 * Skip ahead to freeing the incore metadata because we can't fix
	 * anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a
 * new btree that we're not going to commit.  We want to try to roll things
 * back cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

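/*
 * Sketch of how a repair function is expected to feed these reservations to
 * the btree bulk loader.  The wrapper and structure names below are
 * hypothetical; the bulk loader calls its claim_block hook once for each new
 * btree block it formats:
 *
 *	static int
 *	xrep_mybt_claim_block(
 *		struct xfs_btree_cur	*cur,
 *		union xfs_btree_ptr	*ptr,
 *		void			*priv)
 *	{
 *		struct xrep_mybt	*rm = priv;
 *
 *		return xrep_newbt_claim_block(cur, &rm->new_btree, ptr);
 *	}
 *	...
 *	rm->new_btree.bload.claim_block = xrep_mybt_claim_block;
 */
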
/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}