// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xrep_newbt_resv	*resv;
	int			error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, true);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @xnr, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
482 */ 483 int 484 xrep_newbt_commit( 485 struct xrep_newbt *xnr) 486 { 487 return xrep_newbt_free(xnr, true); 488 } 489 490 /* 491 * Free all the accounting info and all of the disk space we reserved for a new 492 * btree that we're not going to commit. We want to try to roll things back 493 * cleanly for things like ENOSPC midway through allocation. 494 */ 495 void 496 xrep_newbt_cancel( 497 struct xrep_newbt *xnr) 498 { 499 xrep_newbt_free(xnr, false); 500 } 501 502 /* Feed one of the reserved btree blocks to the bulk loader. */ 503 int 504 xrep_newbt_claim_block( 505 struct xfs_btree_cur *cur, 506 struct xrep_newbt *xnr, 507 union xfs_btree_ptr *ptr) 508 { 509 struct xrep_newbt_resv *resv; 510 struct xfs_mount *mp = cur->bc_mp; 511 xfs_agblock_t agbno; 512 513 /* 514 * The first item in the list should always have a free block unless 515 * we're completely out. 516 */ 517 resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); 518 if (resv->used == resv->len) 519 return -ENOSPC; 520 521 /* 522 * Peel off a block from the start of the reservation. We allocate 523 * blocks in order to place blocks on disk in increasing record or key 524 * order. The block reservations tend to end up on the list in 525 * decreasing order, which hopefully results in leaf blocks ending up 526 * together. 527 */ 528 agbno = resv->agbno + resv->used; 529 resv->used++; 530 531 /* If we used all the blocks in this reservation, move it to the end. */ 532 if (resv->used == resv->len) 533 list_move_tail(&resv->list, &xnr->resv_list); 534 535 trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, 536 xnr->oinfo.oi_owner); 537 538 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 539 ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, 540 agbno)); 541 else 542 ptr->s = cpu_to_be32(agbno); 543 544 /* Relog all the EFIs. */ 545 return xrep_defer_finish(xnr->sc); 546 } 547 548 /* How many reserved blocks are unused? */ 549 unsigned int 550 xrep_newbt_unused_blocks( 551 struct xrep_newbt *xnr) 552 { 553 struct xrep_newbt_resv *resv; 554 unsigned int unused = 0; 555 556 list_for_each_entry(resv, &xnr->resv_list, list) 557 unused += resv->len - resv->used; 558 return unused; 559 } 560