// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/fscounters.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly. To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not taking
 * any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine by
 * walking all the AGs to make sure the incore per-AG structure has been
 * initialized. The expected value calculation then iterates the incore per-AG
 * structures as quickly as it can. We snapshot the percpu counters before and
 * after this operation and use the difference in counter values to guess at
 * our tolerance for mismatch between expected and actual counter values.
 */

/*
 * Since the expected value computation is lockless but only browses incore
 * values, the percpu counters should be fairly close to each other. However,
 * we'll allow ourselves to be off by at least this (arbitrary) amount.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
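
/*
 * For example, xchk_fscounters() below applies this tolerance to the inode
 * counters: a racy, lockless summation can leave ifree momentarily above
 * icount, and only a gap wider than this threshold is flagged:
 *
 *	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
 *		xchk_set_corrupt(sc);
 */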

/*
 * Make sure the per-AG structure has been initialized from the on-disk header
 * contents and trust that the incore counters match the ondisk counters. (The
 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
 * summary counters after checking all AG headers). Do this from the setup
 * function so that the inner AG aggregation loop runs as quickly as possible.
 *
 * This function runs during the setup phase /before/ we start checking any
 * metadata.
 */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	int			error = 0;

	while ((pag = xfs_perag_next(mp, pag))) {
		if (xchk_should_terminate(sc, &error))
			break;
		if (xfs_perag_initialised_agi(pag) &&
		    xfs_perag_initialised_agf(pag))
			continue;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
	}

	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_rele(pag);
	return error;
}

static inline int
xchk_fsfreeze(
	struct xfs_scrub	*sc)
{
	int			error;

	error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
	trace_xchk_fsfreeze(sc, error);
	return error;
}

static inline int
xchk_fsthaw(
	struct xfs_scrub	*sc)
{
	int			error;

	/* This should always succeed, since we hold a kernel freeze. */
	error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
	trace_xchk_fsthaw(sc, error);
	return error;
}

/*
 * We couldn't stabilize the filesystem long enough to sample all the variables
 * that comprise the summary counters and compare them to the percpu counters.
 * We need to disable all writer threads, which means taking the first two
 * freeze levels to put userspace to sleep, and the third freeze level to
 * prevent background threads from starting new transactions. Take one level
 * more to prevent other callers from unfreezing the filesystem while we run.
 */
STATIC int
xchk_fscounters_freeze(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc = sc->buf;
	int			error = 0;

	/*
	 * Drop any freeze protection we hold; freeze_super() waits for
	 * writers, so keeping our own write access would block the freeze.
	 */
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}

	/* Try to grab a kernel freeze. */
	while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
		if (xchk_should_terminate(sc, &error))
			return error;

		delay(HZ / 10);
	}
	if (error)
		return error;

	fsc->frozen = true;
	return 0;
}
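
/*
 * The freeze taken above is dropped by xchk_fscounters_cleanup() below;
 * xchk_setup_fscounters() installs that function as sc->buf_cleanup, so the
 * filesystem should be thawed even if the check or repair bails out early.
 */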

/* Thaw the filesystem after checking or repairing fscounters. */
STATIC void
xchk_fscounters_cleanup(
	void			*buf)
{
	struct xchk_fscounters	*fsc = buf;
	struct xfs_scrub	*sc = fsc->sc;
	int			error;

	if (!fsc->frozen)
		return;

	error = xchk_fsthaw(sc);
	if (error)
		xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
	else
		fsc->frozen = false;
}

int
xchk_setup_fscounters(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc;
	int			error;

	/*
	 * If the AGF doesn't track btreeblks, we have to lock the AGF to count
	 * btree block usage by walking the actual btrees.
	 */
	if (!xfs_has_lazysbcount(sc->mp))
		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);

	sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
	if (!sc->buf)
		return -ENOMEM;
	sc->buf_cleanup = xchk_fscounters_cleanup;
	fsc = sc->buf;
	fsc->sc = sc;

	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);

	/* We must get the incore counters set up before we can proceed. */
	error = xchk_fscount_warmup(sc);
	if (error)
		return error;

	/*
	 * Pause all writer activity in the filesystem while we're scrubbing to
	 * reduce the likelihood of background perturbations to the counters
	 * throwing off our calculations.
	 *
	 * If we're repairing, we need to prevent any other thread from
	 * changing the global fs summary counters while we're repairing them.
	 * This requires the fs to be frozen, which will disable background
	 * reclaim and purge all inactive inodes.
	 */
	if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
		error = xchk_fscounters_freeze(sc);
		if (error)
			return error;
	}

	xchk_trans_alloc_empty(sc);
	return 0;
}

/*
 * Part 1: Collecting filesystem summary counts. For each AG, we add its
 * summary counts (total inodes, free inodes, free data blocks) to an incore
 * copy of the overall filesystem summary counts.
 *
 * To avoid false corruption reports in part 2, any failure in this part must
 * set the INCOMPLETE flag even when a negative errno is returned. This care
 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
 * ECANCELED) that are absorbed into a scrub state flag update by
 * xchk_*_process_error. Scrub and repair share the same incore data
 * structures, so the INCOMPLETE flag is critical to prevent a repair based on
 * insufficient information.
 */

/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc,
	xfs_agnumber_t		agno)
{
	xfs_filblks_t		blocks;
	int			error;

	error = xchk_ag_init_existing(sc, agno, &sc->sa);
	if (error)
		goto out_free;

	error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

	error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

out_free:
	xchk_ag_free(sc, &sc->sa);
	return error;
}
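
/*
 * Filesystems with the lazysbcount feature don't need the walk above: the AGF
 * already caches the btree block count in pagf_btreeblks, which the
 * aggregation loop below adds directly.
 */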

/*
 * Calculate what the global in-core counters ought to be from the incore
 * per-AG structure. Callers can compare this to the actual in-core counters
 * to estimate by how much both in-core and on-disk counters need to be
 * adjusted.
 */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = NULL;
	uint64_t		delayed;
	int			tries = 8;
	int			error = 0;

retry:
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	while ((pag = xfs_perag_next(mp, pag))) {
		if (xchk_should_terminate(sc, &error))
			break;

		/* This somehow got unset since the warmup? */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		if (xfs_has_lazysbcount(sc->mp)) {
			fsc->fdblocks += pag->pagf_btreeblks;
		} else {
			error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag));
			if (error)
				break;
		}

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
	}
	if (pag)
		xfs_perag_rele(pag);
	if (error) {
		xchk_set_incomplete(sc);
		return error;
	}

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);

	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things. We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		return -EDEADLOCK;
	}

	return 0;
}
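
/*
 * To summarize the computation above, the expected free block count is
 * roughly:
 *
 *	fdblocks = sum over all AGs of (pagf_freeblks + pagf_flcount +
 *			free space btree blocks - per-AG reservations)
 *		 - global incore reservation - delalloc reservations
 */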

#ifdef CONFIG_XFS_RT
STATIC int
xchk_fscount_add_frextent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct xchk_fscounters	*fsc = priv;
	int			error = 0;

	fsc->frextents += rec->ar_extcount;

	xchk_should_terminate(fsc->sc, &error);
	return error;
}

/* Calculate the number of free realtime extents from the realtime bitmap. */
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_rtgroup	*rtg = NULL;
	int			error;

	fsc->frextents = 0;
	fsc->frextents_delayed = 0;

	/*
	 * Don't bother verifying and repairing the fs counters for zoned file
	 * systems as they don't track an on-disk frextents count, and the
	 * in-memory percpu counter also includes reservations.
	 */
	if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
		return 0;

	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_all(rtg, sc->tp,
				xchk_fscount_add_frextent, fsc);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error) {
			xchk_set_incomplete(sc);
			xfs_rtgroup_rele(rtg);
			return error;
		}
	}

	fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
	return 0;
}
#else
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	fsc->frextents = 0;
	fsc->frextents_delayed = 0;
	return 0;
}
#endif /* CONFIG_XFS_RT */

/*
 * Part 2: Comparing filesystem summary counters. All we have to do here is
 * sum the percpu counters and compare them to what we've observed.
 */

/*
 * Is the @counter reasonably close to the @expected value?
 *
 * We neither locked nor froze anything in the filesystem while aggregating the
 * per-AG data to compute the @expected value, which means that the counter
 * could have changed. We know the @old_value of the summation of the counter
 * before the aggregation, and we re-sum the counter now. If the expected
 * value falls between the two summations, we're ok.
 *
 * Otherwise, we /might/ have a problem. If the change in the summations is
 * more than we want to tolerate, the filesystem is probably busy and we should
 * just send back INCOMPLETE and see if userspace will try again.
 *
 * If we're repairing then we require an exact match.
 */
static inline bool
xchk_fscount_within_range(
	struct xfs_scrub	*sc,
	const int64_t		old_value,
	struct percpu_counter	*counter,
	uint64_t		expected)
{
	int64_t			min_value, max_value;
	int64_t			curr_value = percpu_counter_sum(counter);

	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
			old_value);

	/* Negative values are always wrong. */
	if (curr_value < 0)
		return false;

	/* Exact matches are always ok. */
	if (curr_value == expected)
		return true;

	/* We require exact matches when repair is running. */
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return false;

	min_value = min(old_value, curr_value);
	max_value = max(old_value, curr_value);

	/* Within the before-and-after range is ok. */
	if (expected >= min_value && expected <= max_value)
		return true;

	/* Everything else is bad. */
	return false;
}
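
/*
 * To illustrate with made-up numbers: suppose a counter summed to 1000 before
 * the aggregation and the re-summation above returns 1010. Then, with repair
 * mode off:
 *
 *	xchk_fscount_within_range(sc, 1000, counter, 1004)	-> true
 *	xchk_fscount_within_range(sc, 1000, counter, 1020)	-> false
 */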

/* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks, frextents;
	bool			try_again = false;
	int			error;

	/* Snapshot the percpu counters. */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
	frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0)
		xchk_set_corrupt(sc);

	/*
	 * If the filesystem is not frozen, the counter summation calls above
	 * can race with xfs_dec_freecounter, which subtracts a requested space
	 * reservation from the counter and undoes the subtraction if that made
	 * the counter go negative. Therefore, it's possible to see negative
	 * values here, and we should only flag that as a corruption if we
	 * froze the fs. This is much more likely to happen with frextents
	 * since there are no reserved pools.
	 */
	if (fdblocks < 0 || frextents < 0) {
		if (!fsc->frozen)
			return -EDEADLOCK;

		xchk_set_corrupt(sc);
		return 0;
	}

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/* See if frextents is obviously wrong. */
	if (frextents > mp->m_sb.sb_rextents)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;

	/* Count the free extents for rt volumes. */
	error = xchk_fscount_count_frextents(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/*
	 * Compare the in-core counters with whatever we counted. If the fs is
	 * frozen, we treat the discrepancy as a corruption because the freeze
	 * should have stabilized the counter values. Otherwise, we need
	 * userspace to call us back having granted us freeze permission.
	 */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
				fsc->icount)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, fdblocks,
			&mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xfs_has_zoned(mp) &&
	    !xchk_fscount_within_range(sc, frextents,
			&mp->m_free[XC_FREE_RTEXTENTS].count,
			fsc->frextents - fsc->frextents_delayed)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (try_again)
		return -EDEADLOCK;

	return 0;
}