// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In either case, format the new btree blocks almost completely full to
 * minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/* Let the btree code compute the default slack values. */
	bload->leaf_slack = -1;
	bload->node_slack = -1;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}
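
/*
 * A minimal usage sketch for the init helpers above, not a real caller.  The
 * owner info, reservation type, allocation hint, and fork choice here are
 * illustrative assumptions; an actual repair function picks values that match
 * the btree being rebuilt.
 *
 *	struct xrep_newbt	xnr;
 *	struct xfs_owner_info	oinfo;
 *	int			error;
 *
 *	Stage a new per-AG btree owned by the AG, hinted just past the AGFL:
 *
 *	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG,
 *			XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
 *				       XFS_AGFL_BLOCK(sc->mp) + 1),
 *			XFS_AG_RESV_NONE);
 *
 *	Or stage a new data fork block mapping btree for the file under
 *	repair:
 *
 *	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
 *	error = xrep_newbt_init_inode(&xnr, sc, XFS_DATA_FORK, &oinfo);
 *	if (error)
 *		return error;
 */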

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	ASSERT(xnr->oinfo.oi_offset == 0);

	error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
	if (error)
		goto out_pag;

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
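
/*
 * A sketch of how a repair function might size and fill the reservation,
 * assuming it has already built a staging btree cursor (fake_cur is a
 * placeholder name) and counted the records it intends to load (nr_records):
 *
 *	error = xfs_btree_bload_compute_geometry(fake_cur, &xnr.bload,
 *			nr_records);
 *	if (error)
 *		return error;
 *
 *	error = xrep_newbt_alloc_blocks(&xnr, xnr.bload.nr_blocks);
 *	if (error)
 *		return error;
 *
 * Each loop iteration of the allocators above schedules a deferred autoreap
 * free for the extent it reserves, so a crash before the new btree is
 * committed releases the space automatically.
 */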

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, true);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.
	 * Skip ahead to freeing the incore metadata because we can't fix
	 * anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @xnr, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a
 * new btree that we're not going to commit.  We want to try to roll things
 * back cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}
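
/*
 * Illustrative cleanup pattern for the helpers above (a sketch, not taken
 * from a real repair function): once the bulk loader has written the new
 * btree and its root has been committed, call xrep_newbt_commit() so that
 * only the unused tails of the reservations are freed; on any error before
 * that point, call xrep_newbt_cancel() so the autoreap EFIs release every
 * block we reserved.
 *
 *	if (error)
 *		xrep_newbt_cancel(&xnr);
 *	else
 *		error = xrep_newbt_commit(&xnr);
 */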

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
						agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
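
/*
 * A sketch of how xrep_newbt_claim_block() is typically wired into the bulk
 * loader.  The wrapper name and the priv layout are illustrative assumptions;
 * a real caller also fills in the bload's record-supplying callback before
 * loading, and builds the staging cursor (staged_cur here) beforehand:
 *
 *	static int
 *	xrep_example_claim_block(
 *		struct xfs_btree_cur	*cur,
 *		union xfs_btree_ptr	*ptr,
 *		void			*priv)
 *	{
 *		struct xrep_newbt	*xnr = priv;
 *
 *		return xrep_newbt_claim_block(cur, xnr, ptr);
 *	}
 *
 *	...
 *	xnr.bload.claim_block = xrep_example_claim_block;
 *	error = xfs_btree_bload(staged_cur, &xnr.bload, &xnr);
 */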