1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_inode.h" 17 #include "xfs_inode_fork.h" 18 #include "xfs_alloc.h" 19 #include "xfs_bmap.h" 20 #include "xfs_rmap.h" 21 #include "xfs_refcount.h" 22 #include "xfs_quota.h" 23 #include "xfs_ialloc.h" 24 #include "xfs_ag.h" 25 #include "xfs_error.h" 26 #include "xfs_errortag.h" 27 #include "xfs_icache.h" 28 #include "xfs_refcount_btree.h" 29 #include "scrub/xfs_scrub.h" 30 #include "scrub/scrub.h" 31 #include "scrub/common.h" 32 #include "scrub/trace.h" 33 #include "scrub/repair.h" 34 #include "scrub/bitmap.h" 35 #include "scrub/off_bitmap.h" 36 #include "scrub/fsb_bitmap.h" 37 #include "scrub/reap.h" 38 39 /* 40 * CoW Fork Mapping Repair 41 * ======================= 42 * 43 * Although CoW staging extents are owned by incore CoW inode forks, on disk 44 * they are owned by the refcount btree. The ondisk metadata does not record 45 * any ownership information, which limits what we can do to repair the 46 * mappings in the CoW fork. At most, we can replace ifork mappings that lack 47 * an entry in the refcount btree or are described by a reverse mapping record 48 * whose owner is not OWN_COW. 49 * 50 * Replacing extents is also tricky -- we can't touch written CoW fork extents 51 * since they are undergoing writeback, and delalloc extents do not require 52 * repair since they only exist incore. Hence the most we can do is find the 53 * bad parts of unwritten mappings, allocate a replacement set of blocks, and 54 * replace the incore mapping. We use the regular reaping process to unmap 55 * or free the discarded blocks, as appropriate. 56 */ 57 struct xrep_cow { 58 struct xfs_scrub *sc; 59 60 /* Bitmap of file offset ranges that need replacing. */ 61 struct xoff_bitmap bad_fileoffs; 62 63 /* Bitmap of fsblocks that were removed from the CoW fork. */ 64 struct xfsb_bitmap old_cowfork_fsblocks; 65 66 /* CoW fork mappings used to scan for bad CoW staging extents. */ 67 struct xfs_bmbt_irec irec; 68 69 /* refcount btree block number of irec.br_startblock */ 70 unsigned int irec_startbno; 71 72 /* refcount btree block number of the next refcount record we expect */ 73 unsigned int next_bno; 74 }; 75 76 /* CoW staging extent. */ 77 struct xrep_cow_extent { 78 xfs_fsblock_t fsbno; 79 xfs_extlen_t len; 80 }; 81 82 /* 83 * Mark the part of the file range that corresponds to the given physical 84 * space. Caller must ensure that the physical range is within xc->irec. 85 */ 86 STATIC int 87 xrep_cow_mark_file_range( 88 struct xrep_cow *xc, 89 xfs_fsblock_t startblock, 90 xfs_filblks_t blockcount) 91 { 92 xfs_fileoff_t startoff; 93 94 startoff = xc->irec.br_startoff + 95 (startblock - xc->irec.br_startblock); 96 97 trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, 98 blockcount); 99 100 return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); 101 } 102 103 /* 104 * Trim @src to fit within the CoW fork mapping being examined, and put the 105 * result in @dst. 106 */ 107 static inline void 108 xrep_cow_trim_refcount( 109 struct xrep_cow *xc, 110 struct xfs_refcount_irec *dst, 111 const struct xfs_refcount_irec *src) 112 { 113 unsigned int adj; 114 115 memcpy(dst, src, sizeof(*dst)); 116 117 if (dst->rc_startblock < xc->irec_startbno) { 118 adj = xc->irec_startbno - dst->rc_startblock; 119 dst->rc_blockcount -= adj; 120 dst->rc_startblock += adj; 121 } 122 123 if (dst->rc_startblock + dst->rc_blockcount > 124 xc->irec_startbno + xc->irec.br_blockcount) { 125 adj = (dst->rc_startblock + dst->rc_blockcount) - 126 (xc->irec_startbno + xc->irec.br_blockcount); 127 dst->rc_blockcount -= adj; 128 } 129 } 130 131 /* Mark any shared CoW staging extents. */ 132 STATIC int 133 xrep_cow_mark_shared_staging( 134 struct xfs_btree_cur *cur, 135 const struct xfs_refcount_irec *rec, 136 void *priv) 137 { 138 struct xrep_cow *xc = priv; 139 struct xfs_refcount_irec rrec; 140 141 if (!xfs_refcount_check_domain(rec) || 142 rec->rc_domain != XFS_REFC_DOMAIN_SHARED) 143 return -EFSCORRUPTED; 144 145 xrep_cow_trim_refcount(xc, &rrec, rec); 146 147 return xrep_cow_mark_file_range(xc, 148 xfs_agbno_to_fsb(to_perag(cur->bc_group), 149 rrec.rc_startblock), 150 rrec.rc_blockcount); 151 } 152 153 /* 154 * Mark any portion of the CoW fork file offset range where there is not a CoW 155 * staging extent record in the refcountbt, and keep a record of where we did 156 * find correct refcountbt records. Staging records are always cleaned out at 157 * mount time, so any two inodes trying to map the same staging area would have 158 * already taken the fs down due to refcount btree verifier errors. Hence this 159 * inode should be the sole creator of the staging extent records ondisk. 160 */ 161 STATIC int 162 xrep_cow_mark_missing_staging( 163 struct xfs_btree_cur *cur, 164 const struct xfs_refcount_irec *rec, 165 void *priv) 166 { 167 struct xrep_cow *xc = priv; 168 struct xfs_refcount_irec rrec; 169 int error; 170 171 if (!xfs_refcount_check_domain(rec) || 172 rec->rc_domain != XFS_REFC_DOMAIN_COW) 173 return -EFSCORRUPTED; 174 175 xrep_cow_trim_refcount(xc, &rrec, rec); 176 177 if (xc->next_bno >= rrec.rc_startblock) 178 goto next; 179 180 181 error = xrep_cow_mark_file_range(xc, 182 xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno), 183 rrec.rc_startblock - xc->next_bno); 184 if (error) 185 return error; 186 187 next: 188 xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; 189 return 0; 190 } 191 192 /* 193 * Mark any area that does not correspond to a CoW staging rmap. These are 194 * cross-linked areas that must be avoided. 195 */ 196 STATIC int 197 xrep_cow_mark_missing_staging_rmap( 198 struct xfs_btree_cur *cur, 199 const struct xfs_rmap_irec *rec, 200 void *priv) 201 { 202 struct xrep_cow *xc = priv; 203 xfs_agblock_t rec_bno; 204 xfs_extlen_t rec_len; 205 unsigned int adj; 206 207 if (rec->rm_owner == XFS_RMAP_OWN_COW) 208 return 0; 209 210 rec_bno = rec->rm_startblock; 211 rec_len = rec->rm_blockcount; 212 if (rec_bno < xc->irec_startbno) { 213 adj = xc->irec_startbno - rec_bno; 214 rec_len -= adj; 215 rec_bno += adj; 216 } 217 218 if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) { 219 adj = (rec_bno + rec_len) - 220 (xc->irec_startbno + xc->irec.br_blockcount); 221 rec_len -= adj; 222 } 223 224 return xrep_cow_mark_file_range(xc, 225 xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno), 226 rec_len); 227 } 228 229 /* 230 * Find any part of the CoW fork mapping that isn't a single-owner CoW staging 231 * extent and mark the corresponding part of the file range in the bitmap. 232 */ 233 STATIC int 234 xrep_cow_find_bad( 235 struct xrep_cow *xc) 236 { 237 struct xfs_refcount_irec rc_low = { 0 }; 238 struct xfs_refcount_irec rc_high = { 0 }; 239 struct xfs_rmap_irec rm_low = { 0 }; 240 struct xfs_rmap_irec rm_high = { 0 }; 241 struct xfs_perag *pag; 242 struct xfs_scrub *sc = xc->sc; 243 xfs_agnumber_t agno; 244 int error; 245 246 agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock); 247 xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock); 248 249 pag = xfs_perag_get(sc->mp, agno); 250 if (!pag) 251 return -EFSCORRUPTED; 252 253 error = xrep_ag_init(sc, pag, &sc->sa); 254 if (error) 255 goto out_pag; 256 257 /* Mark any CoW fork extents that are shared. */ 258 rc_low.rc_startblock = xc->irec_startbno; 259 rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 260 rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; 261 error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, 262 xrep_cow_mark_shared_staging, xc); 263 if (error) 264 goto out_sa; 265 266 /* Make sure there are CoW staging extents for the whole mapping. */ 267 rc_low.rc_startblock = xc->irec_startbno; 268 rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 269 rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; 270 xc->next_bno = xc->irec_startbno; 271 error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high, 272 xrep_cow_mark_missing_staging, xc); 273 if (error) 274 goto out_sa; 275 276 if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { 277 error = xrep_cow_mark_file_range(xc, 278 xfs_agbno_to_fsb(pag, xc->next_bno), 279 xc->irec_startbno + xc->irec.br_blockcount - 280 xc->next_bno); 281 if (error) 282 goto out_sa; 283 } 284 285 /* Mark any area has an rmap that isn't a COW staging extent. */ 286 rm_low.rm_startblock = xc->irec_startbno; 287 memset(&rm_high, 0xFF, sizeof(rm_high)); 288 rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 289 error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high, 290 xrep_cow_mark_missing_staging_rmap, xc); 291 if (error) 292 goto out_sa; 293 294 /* 295 * If userspace is forcing us to rebuild the CoW fork or someone turned 296 * on the debugging knob, replace everything in the CoW fork. 297 */ 298 if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || 299 XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { 300 error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, 301 xc->irec.br_blockcount); 302 if (error) 303 return error; 304 } 305 306 out_sa: 307 xchk_ag_free(sc, &sc->sa); 308 out_pag: 309 xfs_perag_put(pag); 310 return 0; 311 } 312 313 /* 314 * Allocate a replacement CoW staging extent of up to the given number of 315 * blocks, and fill out the mapping. 316 */ 317 STATIC int 318 xrep_cow_alloc( 319 struct xfs_scrub *sc, 320 xfs_extlen_t maxlen, 321 struct xrep_cow_extent *repl) 322 { 323 struct xfs_alloc_arg args = { 324 .tp = sc->tp, 325 .mp = sc->mp, 326 .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, 327 .minlen = 1, 328 .maxlen = maxlen, 329 .prod = 1, 330 .resv = XFS_AG_RESV_NONE, 331 .datatype = XFS_ALLOC_USERDATA, 332 }; 333 int error; 334 335 error = xfs_trans_reserve_more(sc->tp, maxlen, 0); 336 if (error) 337 return error; 338 339 error = xfs_alloc_vextent_start_ag(&args, 340 XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino)); 341 if (error) 342 return error; 343 if (args.fsbno == NULLFSBLOCK) 344 return -ENOSPC; 345 346 xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); 347 348 repl->fsbno = args.fsbno; 349 repl->len = args.len; 350 return 0; 351 } 352 353 /* 354 * Look up the current CoW fork mapping so that we only allocate enough to 355 * replace a single mapping. If we don't find a mapping that covers the start 356 * of the file range, or we find a delalloc or written extent, something is 357 * seriously wrong, since we didn't drop the ILOCK. 358 */ 359 static inline int 360 xrep_cow_find_mapping( 361 struct xrep_cow *xc, 362 struct xfs_iext_cursor *icur, 363 xfs_fileoff_t startoff, 364 struct xfs_bmbt_irec *got) 365 { 366 struct xfs_inode *ip = xc->sc->ip; 367 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 368 369 if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got)) 370 goto bad; 371 372 if (got->br_startoff > startoff) 373 goto bad; 374 375 if (got->br_blockcount == 0) 376 goto bad; 377 378 if (isnullstartblock(got->br_startblock)) 379 goto bad; 380 381 if (xfs_bmap_is_written_extent(got)) 382 goto bad; 383 384 return 0; 385 bad: 386 ASSERT(0); 387 return -EFSCORRUPTED; 388 } 389 390 #define REPLACE_LEFT_SIDE (1U << 0) 391 #define REPLACE_RIGHT_SIDE (1U << 1) 392 393 /* 394 * Given a CoW fork mapping @got and a replacement mapping @repl, remap the 395 * beginning of @got with the space described by @rep. 396 */ 397 static inline void 398 xrep_cow_replace_mapping( 399 struct xfs_inode *ip, 400 struct xfs_iext_cursor *icur, 401 const struct xfs_bmbt_irec *got, 402 const struct xrep_cow_extent *repl) 403 { 404 struct xfs_bmbt_irec new = *got; /* struct copy */ 405 406 ASSERT(repl->len > 0); 407 ASSERT(!isnullstartblock(got->br_startblock)); 408 409 trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len); 410 411 if (got->br_blockcount == repl->len) { 412 /* 413 * The new extent is a complete replacement for the existing 414 * extent. Update the COW fork record. 415 */ 416 new.br_startblock = repl->fsbno; 417 xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); 418 return; 419 } 420 421 /* 422 * The new extent can replace the beginning of the COW fork record. 423 * Move the left side of @got upwards, then insert the new record. 424 */ 425 new.br_startoff += repl->len; 426 new.br_startblock += repl->len; 427 new.br_blockcount -= repl->len; 428 xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new); 429 430 new.br_startoff = got->br_startoff; 431 new.br_startblock = repl->fsbno; 432 new.br_blockcount = repl->len; 433 xfs_iext_insert(ip, icur, &new, BMAP_COWFORK); 434 } 435 436 /* 437 * Replace the unwritten CoW staging extent backing the given file range with a 438 * new space extent that isn't as problematic. 439 */ 440 STATIC int 441 xrep_cow_replace_range( 442 struct xrep_cow *xc, 443 xfs_fileoff_t startoff, 444 xfs_extlen_t *blockcount) 445 { 446 struct xfs_iext_cursor icur; 447 struct xrep_cow_extent repl; 448 struct xfs_bmbt_irec got; 449 struct xfs_scrub *sc = xc->sc; 450 xfs_fileoff_t nextoff; 451 xfs_extlen_t alloc_len; 452 int error; 453 454 /* 455 * Put the existing CoW fork mapping in @got. If @got ends before 456 * @rep, truncate @rep so we only replace one extent mapping at a time. 457 */ 458 error = xrep_cow_find_mapping(xc, &icur, startoff, &got); 459 if (error) 460 return error; 461 nextoff = min(startoff + *blockcount, 462 got.br_startoff + got.br_blockcount); 463 464 /* 465 * Allocate a replacement extent. If we don't fill all the blocks, 466 * shorten the quantity that will be deleted in this step. 467 */ 468 alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, 469 nextoff - startoff); 470 error = xrep_cow_alloc(sc, alloc_len, &repl); 471 if (error) 472 return error; 473 474 /* 475 * Replace the old mapping with the new one, and commit the metadata 476 * changes made so far. 477 */ 478 xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl); 479 480 xfs_inode_set_cowblocks_tag(sc->ip); 481 error = xfs_defer_finish(&sc->tp); 482 if (error) 483 return error; 484 485 /* Note the old CoW staging extents; we'll reap them all later. */ 486 error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, 487 repl.len); 488 if (error) 489 return error; 490 491 *blockcount = repl.len; 492 return 0; 493 } 494 495 /* 496 * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc 497 * reservation. 498 */ 499 STATIC int 500 xrep_cow_replace( 501 uint64_t startoff, 502 uint64_t blockcount, 503 void *priv) 504 { 505 struct xrep_cow *xc = priv; 506 int error = 0; 507 508 while (blockcount > 0) { 509 xfs_extlen_t len = min_t(xfs_filblks_t, blockcount, 510 XFS_MAX_BMBT_EXTLEN); 511 512 error = xrep_cow_replace_range(xc, startoff, &len); 513 if (error) 514 break; 515 516 blockcount -= len; 517 startoff += len; 518 } 519 520 return error; 521 } 522 523 /* 524 * Repair an inode's CoW fork. The CoW fork is an in-core structure, so 525 * there's no btree to rebuid. Instead, we replace any mappings that are 526 * cross-linked or lack ondisk CoW fork records in the refcount btree. 527 */ 528 int 529 xrep_bmap_cow( 530 struct xfs_scrub *sc) 531 { 532 struct xrep_cow *xc; 533 struct xfs_iext_cursor icur; 534 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK); 535 int error; 536 537 if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp)) 538 return -EOPNOTSUPP; 539 540 if (!ifp) 541 return 0; 542 543 /* realtime files aren't supported yet */ 544 if (XFS_IS_REALTIME_INODE(sc->ip)) 545 return -EOPNOTSUPP; 546 547 /* 548 * If we're somehow not in extents format, then reinitialize it to 549 * an empty extent mapping fork and exit. 550 */ 551 if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { 552 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 553 ifp->if_nextents = 0; 554 return 0; 555 } 556 557 xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS); 558 if (!xc) 559 return -ENOMEM; 560 561 xfs_trans_ijoin(sc->tp, sc->ip, 0); 562 563 xc->sc = sc; 564 xoff_bitmap_init(&xc->bad_fileoffs); 565 xfsb_bitmap_init(&xc->old_cowfork_fsblocks); 566 567 for_each_xfs_iext(ifp, &icur, &xc->irec) { 568 if (xchk_should_terminate(sc, &error)) 569 goto out_bitmap; 570 571 /* 572 * delalloc reservations only exist incore, so there is no 573 * ondisk metadata that we can examine. Hence we leave them 574 * alone. 575 */ 576 if (isnullstartblock(xc->irec.br_startblock)) 577 continue; 578 579 /* 580 * COW fork extents are only in the written state if writeback 581 * is actively writing to disk. We cannot restart the write 582 * at a different disk address since we've already issued the 583 * IO, so we leave these alone and hope for the best. 584 */ 585 if (xfs_bmap_is_written_extent(&xc->irec)) 586 continue; 587 588 error = xrep_cow_find_bad(xc); 589 if (error) 590 goto out_bitmap; 591 } 592 593 /* Replace any bad unwritten mappings with fresh reservations. */ 594 error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc); 595 if (error) 596 goto out_bitmap; 597 598 /* 599 * Reap as many of the old CoW blocks as we can. They are owned ondisk 600 * by the refcount btree, not the inode, so it is correct to treat them 601 * like inode metadata. 602 */ 603 error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, 604 &XFS_RMAP_OINFO_COW); 605 if (error) 606 goto out_bitmap; 607 608 out_bitmap: 609 xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); 610 xoff_bitmap_destroy(&xc->bad_fileoffs); 611 kfree(xc); 612 return error; 613 } 614