1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_inode.h" 17 #include "xfs_inode_fork.h" 18 #include "xfs_alloc.h" 19 #include "xfs_bmap.h" 20 #include "xfs_rmap.h" 21 #include "xfs_refcount.h" 22 #include "xfs_quota.h" 23 #include "xfs_ialloc.h" 24 #include "xfs_ag.h" 25 #include "xfs_error.h" 26 #include "xfs_errortag.h" 27 #include "xfs_icache.h" 28 #include "xfs_refcount_btree.h" 29 #include "scrub/xfs_scrub.h" 30 #include "scrub/scrub.h" 31 #include "scrub/common.h" 32 #include "scrub/trace.h" 33 #include "scrub/repair.h" 34 #include "scrub/bitmap.h" 35 #include "scrub/off_bitmap.h" 36 #include "scrub/fsb_bitmap.h" 37 #include "scrub/reap.h" 38 39 /* 40 * CoW Fork Mapping Repair 41 * ======================= 42 * 43 * Although CoW staging extents are owned by incore CoW inode forks, on disk 44 * they are owned by the refcount btree. The ondisk metadata does not record 45 * any ownership information, which limits what we can do to repair the 46 * mappings in the CoW fork. At most, we can replace ifork mappings that lack 47 * an entry in the refcount btree or are described by a reverse mapping record 48 * whose owner is not OWN_COW. 49 * 50 * Replacing extents is also tricky -- we can't touch written CoW fork extents 51 * since they are undergoing writeback, and delalloc extents do not require 52 * repair since they only exist incore. Hence the most we can do is find the 53 * bad parts of unwritten mappings, allocate a replacement set of blocks, and 54 * replace the incore mapping. We use the regular reaping process to unmap 55 * or free the discarded blocks, as appropriate. 56 */ 57 struct xrep_cow { 58 struct xfs_scrub *sc; 59 60 /* Bitmap of file offset ranges that need replacing. */ 61 struct xoff_bitmap bad_fileoffs; 62 63 /* Bitmap of fsblocks that were removed from the CoW fork. */ 64 struct xfsb_bitmap old_cowfork_fsblocks; 65 66 /* CoW fork mappings used to scan for bad CoW staging extents. */ 67 struct xfs_bmbt_irec irec; 68 69 /* refcount btree block number of irec.br_startblock */ 70 unsigned int irec_startbno; 71 72 /* refcount btree block number of the next refcount record we expect */ 73 unsigned int next_bno; 74 }; 75 76 /* CoW staging extent. */ 77 struct xrep_cow_extent { 78 xfs_fsblock_t fsbno; 79 xfs_extlen_t len; 80 }; 81 82 /* 83 * Mark the part of the file range that corresponds to the given physical 84 * space. Caller must ensure that the physical range is within xc->irec. 85 */ 86 STATIC int 87 xrep_cow_mark_file_range( 88 struct xrep_cow *xc, 89 xfs_fsblock_t startblock, 90 xfs_filblks_t blockcount) 91 { 92 xfs_fileoff_t startoff; 93 94 startoff = xc->irec.br_startoff + 95 (startblock - xc->irec.br_startblock); 96 97 trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, 98 blockcount); 99 100 return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); 101 } 102 103 /* 104 * Trim @src to fit within the CoW fork mapping being examined, and put the 105 * result in @dst. 106 */ 107 static inline void 108 xrep_cow_trim_refcount( 109 struct xrep_cow *xc, 110 struct xfs_refcount_irec *dst, 111 const struct xfs_refcount_irec *src) 112 { 113 unsigned int adj; 114 115 memcpy(dst, src, sizeof(*dst)); 116 117 if (dst->rc_startblock < xc->irec_startbno) { 118 adj = xc->irec_startbno - dst->rc_startblock; 119 dst->rc_blockcount -= adj; 120 dst->rc_startblock += adj; 121 } 122 123 if (dst->rc_startblock + dst->rc_blockcount > 124 xc->irec_startbno + xc->irec.br_blockcount) { 125 adj = (dst->rc_startblock + dst->rc_blockcount) - 126 (xc->irec_startbno + xc->irec.br_blockcount); 127 dst->rc_blockcount -= adj; 128 } 129 } 130 131 /* Mark any shared CoW staging extents. */ 132 STATIC int 133 xrep_cow_mark_shared_staging( 134 struct xfs_btree_cur *cur, 135 const struct xfs_refcount_irec *rec, 136 void *priv) 137 { 138 struct xrep_cow *xc = priv; 139 struct xfs_refcount_irec rrec; 140 xfs_fsblock_t fsbno; 141 142 if (!xfs_refcount_check_domain(rec) || 143 rec->rc_domain != XFS_REFC_DOMAIN_SHARED) 144 return -EFSCORRUPTED; 145 146 xrep_cow_trim_refcount(xc, &rrec, rec); 147 148 fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, 149 rrec.rc_startblock); 150 return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); 151 } 152 153 /* 154 * Mark any portion of the CoW fork file offset range where there is not a CoW 155 * staging extent record in the refcountbt, and keep a record of where we did 156 * find correct refcountbt records. Staging records are always cleaned out at 157 * mount time, so any two inodes trying to map the same staging area would have 158 * already taken the fs down due to refcount btree verifier errors. Hence this 159 * inode should be the sole creator of the staging extent records ondisk. 160 */ 161 STATIC int 162 xrep_cow_mark_missing_staging( 163 struct xfs_btree_cur *cur, 164 const struct xfs_refcount_irec *rec, 165 void *priv) 166 { 167 struct xrep_cow *xc = priv; 168 struct xfs_refcount_irec rrec; 169 int error; 170 171 if (!xfs_refcount_check_domain(rec) || 172 rec->rc_domain != XFS_REFC_DOMAIN_COW) 173 return -EFSCORRUPTED; 174 175 xrep_cow_trim_refcount(xc, &rrec, rec); 176 177 if (xc->next_bno >= rrec.rc_startblock) 178 goto next; 179 180 error = xrep_cow_mark_file_range(xc, 181 XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, 182 xc->next_bno), 183 rrec.rc_startblock - xc->next_bno); 184 if (error) 185 return error; 186 187 next: 188 xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; 189 return 0; 190 } 191 192 /* 193 * Mark any area that does not correspond to a CoW staging rmap. These are 194 * cross-linked areas that must be avoided. 195 */ 196 STATIC int 197 xrep_cow_mark_missing_staging_rmap( 198 struct xfs_btree_cur *cur, 199 const struct xfs_rmap_irec *rec, 200 void *priv) 201 { 202 struct xrep_cow *xc = priv; 203 xfs_fsblock_t fsbno; 204 xfs_agblock_t rec_bno; 205 xfs_extlen_t rec_len; 206 unsigned int adj; 207 208 if (rec->rm_owner == XFS_RMAP_OWN_COW) 209 return 0; 210 211 rec_bno = rec->rm_startblock; 212 rec_len = rec->rm_blockcount; 213 if (rec_bno < xc->irec_startbno) { 214 adj = xc->irec_startbno - rec_bno; 215 rec_len -= adj; 216 rec_bno += adj; 217 } 218 219 if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { 220 adj = (rec_bno + rec_len) - 221 (xc->irec_startbno + xc->irec.br_blockcount); 222 rec_len -= adj; 223 } 224 225 fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); 226 return xrep_cow_mark_file_range(xc, fsbno, rec_len); 227 } 228 229 /* 230 * Find any part of the CoW fork mapping that isn't a single-owner CoW staging 231 * extent and mark the corresponding part of the file range in the bitmap. 232 */ 233 STATIC int 234 xrep_cow_find_bad( 235 struct xrep_cow *xc) 236 { 237 struct xfs_refcount_irec rc_low = { 0 }; 238 struct xfs_refcount_irec rc_high = { 0 }; 239 struct xfs_rmap_irec rm_low = { 0 }; 240 struct xfs_rmap_irec rm_high = { 0 }; 241 struct xfs_perag *pag; 242 struct xfs_scrub *sc = xc->sc; 243 xfs_agnumber_t agno; 244 int error; 245 246 agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); 247 xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); 248 249 pag = xfs_perag_get(sc->mp, agno); 250 if (!pag) 251 return -EFSCORRUPTED; 252 253 error = xrep_ag_init(sc, pag, &sc->sa); 254 if (error) 255 goto out_pag; 256 257 /* Mark any CoW fork extents that are shared. */ 258 rc_low.rc_startblock = xc->irec_startbno; 259 rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 260 rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; 261 error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, 262 xrep_cow_mark_shared_staging, xc); 263 if (error) 264 goto out_sa; 265 266 /* Make sure there are CoW staging extents for the whole mapping. */ 267 rc_low.rc_startblock = xc->irec_startbno; 268 rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 269 rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; 270 xc->next_bno = xc->irec_startbno; 271 error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, 272 xrep_cow_mark_missing_staging, xc); 273 if (error) 274 goto out_sa; 275 276 if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { 277 error = xrep_cow_mark_file_range(xc, 278 XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, 279 xc->next_bno), 280 xc->irec_startbno + xc->irec.br_blockcount - 281 xc->next_bno); 282 if (error) 283 goto out_sa; 284 } 285 286 /* Mark any area has an rmap that isn't a COW staging extent. */ 287 rm_low.rm_startblock = xc->irec_startbno; 288 memset(&rm_high, 0xFF, sizeof(rm_high)); 289 rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 290 error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, 291 xrep_cow_mark_missing_staging_rmap, xc); 292 if (error) 293 goto out_sa; 294 295 /* 296 * If userspace is forcing us to rebuild the CoW fork or someone turned 297 * on the debugging knob, replace everything in the CoW fork. 298 */ 299 if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || 300 XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { 301 error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, 302 xc->irec.br_blockcount); 303 if (error) 304 return error; 305 } 306 307 out_sa: 308 xchk_ag_free(sc, &sc->sa); 309 out_pag: 310 xfs_perag_put(pag); 311 return 0; 312 } 313 314 /* 315 * Allocate a replacement CoW staging extent of up to the given number of 316 * blocks, and fill out the mapping. 317 */ 318 STATIC int 319 xrep_cow_alloc( 320 struct xfs_scrub *sc, 321 xfs_extlen_t maxlen, 322 struct xrep_cow_extent *repl) 323 { 324 struct xfs_alloc_arg args = { 325 .tp = sc->tp, 326 .mp = sc->mp, 327 .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, 328 .minlen = 1, 329 .maxlen = maxlen, 330 .prod = 1, 331 .resv = XFS_AG_RESV_NONE, 332 .datatype = XFS_ALLOC_USERDATA, 333 }; 334 int error; 335 336 error = xfs_trans_reserve_more(sc->tp, maxlen, 0); 337 if (error) 338 return error; 339 340 error = xfs_alloc_vextent_start_ag(&args, 341 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); 342 if (error) 343 return error; 344 if (args.fsbno == NULLFSBLOCK) 345 return -ENOSPC; 346 347 xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); 348 349 repl->fsbno = args.fsbno; 350 repl->len = args.len; 351 return 0; 352 } 353 354 /* 355 * Look up the current CoW fork mapping so that we only allocate enough to 356 * replace a single mapping. If we don't find a mapping that covers the start 357 * of the file range, or we find a delalloc or written extent, something is 358 * seriously wrong, since we didn't drop the ILOCK. 359 */ 360 static inline int 361 xrep_cow_find_mapping( 362 struct xrep_cow *xc, 363 struct xfs_iext_cursor *icur, 364 xfs_fileoff_t startoff, 365 struct xfs_bmbt_irec *got) 366 { 367 struct xfs_inode *ip = xc->sc->ip; 368 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 369 370 if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) 371 goto bad; 372 373 if (got->br_startoff > startoff) 374 goto bad; 375 376 if (got->br_blockcount == 0) 377 goto bad; 378 379 if (isnullstartblock(got->br_startblock)) 380 goto bad; 381 382 if (xfs_bmap_is_written_extent(got)) 383 goto bad; 384 385 return 0; 386 bad: 387 ASSERT(0); 388 return -EFSCORRUPTED; 389 } 390 391 #define REPLACE_LEFT_SIDE (1U << 0) 392 #define REPLACE_RIGHT_SIDE (1U << 1) 393 394 /* 395 * Given a CoW fork mapping @got and a replacement mapping @repl, remap the 396 * beginning of @got with the space described by @rep. 397 */ 398 static inline void 399 xrep_cow_replace_mapping( 400 struct xfs_inode *ip, 401 struct xfs_iext_cursor *icur, 402 const struct xfs_bmbt_irec *got, 403 const struct xrep_cow_extent *repl) 404 { 405 struct xfs_bmbt_irec new = *got; /* struct copy */ 406 407 ASSERT(repl->len > 0); 408 ASSERT(!isnullstartblock(got->br_startblock)); 409 410 trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); 411 412 if (got->br_blockcount == repl->len) { 413 /* 414 * The new extent is a complete replacement for the existing 415 * extent. Update the COW fork record. 416 */ 417 new.br_startblock = repl->fsbno; 418 xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); 419 return; 420 } 421 422 /* 423 * The new extent can replace the beginning of the COW fork record. 424 * Move the left side of @got upwards, then insert the new record. 425 */ 426 new.br_startoff += repl->len; 427 new.br_startblock += repl->len; 428 new.br_blockcount -= repl->len; 429 xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); 430 431 new.br_startoff = got->br_startoff; 432 new.br_startblock = repl->fsbno; 433 new.br_blockcount = repl->len; 434 xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); 435 } 436 437 /* 438 * Replace the unwritten CoW staging extent backing the given file range with a 439 * new space extent that isn't as problematic. 440 */ 441 STATIC int 442 xrep_cow_replace_range( 443 struct xrep_cow *xc, 444 xfs_fileoff_t startoff, 445 xfs_extlen_t *blockcount) 446 { 447 struct xfs_iext_cursor icur; 448 struct xrep_cow_extent repl; 449 struct xfs_bmbt_irec got; 450 struct xfs_scrub *sc = xc->sc; 451 xfs_fileoff_t nextoff; 452 xfs_extlen_t alloc_len; 453 int error; 454 455 /* 456 * Put the existing CoW fork mapping in @got. If @got ends before 457 * @rep, truncate @rep so we only replace one extent mapping at a time. 458 */ 459 error = xrep_cow_find_mapping(xc, &icur, startoff, &got); 460 if (error) 461 return error; 462 nextoff = min(startoff + *blockcount, 463 got.br_startoff + got.br_blockcount); 464 465 /* 466 * Allocate a replacement extent. If we don't fill all the blocks, 467 * shorten the quantity that will be deleted in this step. 468 */ 469 alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, 470 nextoff - startoff); 471 error = xrep_cow_alloc(sc, alloc_len, &repl); 472 if (error) 473 return error; 474 475 /* 476 * Replace the old mapping with the new one, and commit the metadata 477 * changes made so far. 478 */ 479 xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); 480 481 xfs_inode_set_cowblocks_tag(sc->ip); 482 error = xfs_defer_finish(&sc->tp); 483 if (error) 484 return error; 485 486 /* Note the old CoW staging extents; we'll reap them all later. */ 487 error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, 488 repl.len); 489 if (error) 490 return error; 491 492 *blockcount = repl.len; 493 return 0; 494 } 495 496 /* 497 * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc 498 * reservation. 499 */ 500 STATIC int 501 xrep_cow_replace( 502 uint64_t startoff, 503 uint64_t blockcount, 504 void *priv) 505 { 506 struct xrep_cow *xc = priv; 507 int error = 0; 508 509 while (blockcount > 0) { 510 xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, 511 XFS_MAX_BMBT_EXTLEN); 512 513 error = xrep_cow_replace_range(xc, startoff, &len); 514 if (error) 515 break; 516 517 blockcount -= len; 518 startoff += len; 519 } 520 521 return error; 522 } 523 524 /* 525 * Repair an inode's CoW fork. The CoW fork is an in-core structure, so 526 * there's no btree to rebuid. Instead, we replace any mappings that are 527 * cross-linked or lack ondisk CoW fork records in the refcount btree. 528 */ 529 int 530 xrep_bmap_cow( 531 struct xfs_scrub *sc) 532 { 533 struct xrep_cow *xc; 534 struct xfs_iext_cursor icur; 535 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); 536 int error; 537 538 if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) 539 return -EOPNOTSUPP; 540 541 if (!ifp) 542 return 0; 543 544 /* realtime files aren't supported yet */ 545 if (XFS_IS_REALTIME_INODE(sc->ip)) 546 return -EOPNOTSUPP; 547 548 /* 549 * If we're somehow not in extents format, then reinitialize it to 550 * an empty extent mapping fork and exit. 551 */ 552 if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { 553 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 554 ifp->if_nextents = 0; 555 return 0; 556 } 557 558 xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); 559 if (!xc) 560 return -ENOMEM; 561 562 xfs_trans_ijoin(sc->tp, sc->ip, 0); 563 564 xc->sc = sc; 565 xoff_bitmap_init(&xc->bad_fileoffs); 566 xfsb_bitmap_init(&xc->old_cowfork_fsblocks); 567 568 for_each_xfs_iext(ifp, &icur, &xc->irec) { 569 if (xchk_should_terminate(sc, &error)) 570 goto out_bitmap; 571 572 /* 573 * delalloc reservations only exist incore, so there is no 574 * ondisk metadata that we can examine. Hence we leave them 575 * alone. 576 */ 577 if (isnullstartblock(xc->irec.br_startblock)) 578 continue; 579 580 /* 581 * COW fork extents are only in the written state if writeback 582 * is actively writing to disk. We cannot restart the write 583 * at a different disk address since we've already issued the 584 * IO, so we leave these alone and hope for the best. 585 */ 586 if (xfs_bmap_is_written_extent(&xc->irec)) 587 continue; 588 589 error = xrep_cow_find_bad(xc); 590 if (error) 591 goto out_bitmap; 592 } 593 594 /* Replace any bad unwritten mappings with fresh reservations. */ 595 error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); 596 if (error) 597 goto out_bitmap; 598 599 /* 600 * Reap as many of the old CoW blocks as we can. They are owned ondisk 601 * by the refcount btree, not the inode, so it is correct to treat them 602 * like inode metadata. 603 */ 604 error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, 605 &XFS_RMAP_OINFO_COW); 606 if (error) 607 goto out_bitmap; 608 609 out_bitmap: 610 xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); 611 xoff_bitmap_destroy(&xc->bad_fileoffs); 612 kfree(xc); 613 return error; 614 } 615