// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (1) If someone turned one of the debug knobs.
 * (2) If this is a per-AG btree and the AG has less than 10% space free.
 * (3) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}
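/*
 * Example usage (an illustrative sketch, not part of this file): a per-AG
 * btree repair would typically stage and commit a new btree roughly like
 * this.  The xrep_fakebt_build name and the elided bulk-loading step are
 * hypothetical; real callers live in the other scrub repair files.
 */
STATIC int
xrep_fakebt_build(
	struct xfs_scrub	*sc,
	uint64_t		nr_blocks)
{
	struct xrep_newbt	xnr;
	int			error;

	/* Stage a new btree with blocks accounted to the rmap AG owner. */
	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG, NULLFSBLOCK,
			XFS_AG_RESV_NONE);

	/* Reserve all the space we think the new btree will need. */
	error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
	if (error)
		goto out_cancel;

	/* ...bulk-load records into the staged btree here... */

	/* Dispose of the accounting info and any space we didn't use. */
	return xrep_newbt_commit(&xnr);
out_cancel:
	xrep_newbt_cancel(&xnr);
	return error;
}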
/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork	*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xrep_newbt_resv	*resv;
	int			error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args, true,
				&resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp,
						       xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}
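/*
 * Example usage (an illustrative sketch, not part of this file): a repair
 * that already knows which blocks the new btree should reuse can seed the
 * reservation pool with xrep_newbt_add_extent above instead of allocating
 * fresh space.  The xrep_fakebt_reuse_extent name is hypothetical.  Because
 * no autoreap is scheduled for such extents, the caller must reap them
 * itself if the repair is cancelled.
 */
STATIC int
xrep_fakebt_reuse_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,	/* passive reference */
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	/* Reject extents that don't lie within the AG's usable space. */
	if (!xfs_verify_agbext(pag, agbno, len))
		return -EFSCORRUPTED;

	return xrep_newbt_add_extent(xnr, pag, agbno, len);
}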
/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
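/*
 * Example (an illustrative sketch, not part of this file): both allocation
 * paths above defer to xnr->alloc_vextent if the caller installed one.  A
 * hypothetical hook like this could demand a single contiguous extent by
 * raising minlen to match maxlen before calling into the allocator.  A
 * caller would set xnr->alloc_vextent to this function after one of the
 * xrep_newbt_init_* calls.
 */
STATIC int
xrep_fakebt_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	/* All or nothing: don't accept a shorter extent than we asked for. */
	args->minlen = args->maxlen;
	return xfs_alloc_vextent_near_bno(args, alloc_hint);
}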
/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, true);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.
	 * Skip ahead to freeing the incore metadata because we can't fix
	 * anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
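/*
 * Worked example for xrep_newbt_free_extent above: suppose a reservation
 * covers agbno 100 with len 8 and the bulk loader consumed 5 of those
 * blocks.  If the new btree was committed, we cancel the autoreap, step
 * free_agbno to 105 and free_aglen to 3, and log one new EFI to free blocks
 * 105-107.  If the btree was not committed (or used == 0), the original
 * autoreap EFI frees all 8 blocks instead.
 */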
490 */ 491 int 492 xrep_newbt_commit( 493 struct xrep_newbt *xnr) 494 { 495 return xrep_newbt_free(xnr, true); 496 } 497 498 /* 499 * Free all the accounting info and all of the disk space we reserved for a new 500 * btree that we're not going to commit. We want to try to roll things back 501 * cleanly for things like ENOSPC midway through allocation. 502 */ 503 void 504 xrep_newbt_cancel( 505 struct xrep_newbt *xnr) 506 { 507 xrep_newbt_free(xnr, false); 508 } 509 510 /* Feed one of the reserved btree blocks to the bulk loader. */ 511 int 512 xrep_newbt_claim_block( 513 struct xfs_btree_cur *cur, 514 struct xrep_newbt *xnr, 515 union xfs_btree_ptr *ptr) 516 { 517 struct xrep_newbt_resv *resv; 518 struct xfs_mount *mp = cur->bc_mp; 519 xfs_agblock_t agbno; 520 521 /* 522 * The first item in the list should always have a free block unless 523 * we're completely out. 524 */ 525 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); 526 if (resv->used == resv->len) 527 return -ENOSPC; 528 529 /* 530 * Peel off a block from the start of the reservation. We allocate 531 * blocks in order to place blocks on disk in increasing record or key 532 * order. The block reservations tend to end up on the list in 533 * decreasing order, which hopefully results in leaf blocks ending up 534 * together. 535 */ 536 agbno = resv->agbno + resv->used; 537 resv->used++; 538 539 /* If we used all the blocks in this reservation, move it to the end. */ 540 if (resv->used == resv->len) 541 list_move_tail(&resv->list, &xnr->resv_list); 542 543 trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, 544 xnr->oinfo.oi_owner); 545 546 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 547 ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, 548 agbno)); 549 else 550 ptr->s = cpu_to_be32(agbno); 551 552 /* Relog all the EFIs. */ 553 return xrep_defer_finish(xnr->sc); 554 } 555 556 /* How many reserved blocks are unused? */ 557 unsigned int 558 xrep_newbt_unused_blocks( 559 struct xrep_newbt *xnr) 560 { 561 struct xrep_newbt_resv *resv; 562 unsigned int unused = 0; 563 564 list_for_each_entry(resv, &xnr->resv_list, list) 565 unused += resv->len - resv->used; 566 return unused; 567 } 568