// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/fscounters.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly. To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not taking
 * any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine by
 * walking all the AGs to make sure the incore per-AG structure has been
 * initialized. The expected value calculation then iterates the incore per-AG
 * structures as quickly as it can.
We snapshot the percpu counters before and 48 * after this operation and use the difference in counter values to guess at 49 * our tolerance for mismatch between expected and actual counter values. 50 */ 51 52 /* 53 * Since the expected value computation is lockless but only browses incore 54 * values, the percpu counters should be fairly close to each other. However, 55 * we'll allow ourselves to be off by at least this (arbitrary) amount. 56 */ 57 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 58 59 /* 60 * Make sure the per-AG structure has been initialized from the on-disk header 61 * contents and trust that the incore counters match the ondisk counters. (The 62 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 63 * summary counters after checking all AG headers). Do this from the setup 64 * function so that the inner AG aggregation loop runs as quickly as possible. 65 * 66 * This function runs during the setup phase /before/ we start checking any 67 * metadata. 68 */ 69 STATIC int 70 xchk_fscount_warmup( 71 struct xfs_scrub *sc) 72 { 73 struct xfs_mount *mp = sc->mp; 74 struct xfs_buf *agi_bp = NULL; 75 struct xfs_buf *agf_bp = NULL; 76 struct xfs_perag *pag = NULL; 77 xfs_agnumber_t agno; 78 int error = 0; 79 80 for_each_perag(mp, agno, pag) { 81 if (xchk_should_terminate(sc, &error)) 82 break; 83 if (xfs_perag_initialised_agi(pag) && 84 xfs_perag_initialised_agf(pag)) 85 continue; 86 87 /* Lock both AG headers. */ 88 error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); 89 if (error) 90 break; 91 error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); 92 if (error) 93 break; 94 95 /* 96 * These are supposed to be initialized by the header read 97 * function. 
98 */ 99 if (!xfs_perag_initialised_agi(pag) || 100 !xfs_perag_initialised_agf(pag)) { 101 error = -EFSCORRUPTED; 102 break; 103 } 104 105 xfs_buf_relse(agf_bp); 106 agf_bp = NULL; 107 xfs_buf_relse(agi_bp); 108 agi_bp = NULL; 109 } 110 111 if (agf_bp) 112 xfs_buf_relse(agf_bp); 113 if (agi_bp) 114 xfs_buf_relse(agi_bp); 115 if (pag) 116 xfs_perag_rele(pag); 117 return error; 118 } 119 120 static inline int 121 xchk_fsfreeze( 122 struct xfs_scrub *sc) 123 { 124 int error; 125 126 error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 127 trace_xchk_fsfreeze(sc, error); 128 return error; 129 } 130 131 static inline int 132 xchk_fsthaw( 133 struct xfs_scrub *sc) 134 { 135 int error; 136 137 /* This should always succeed, we have a kernel freeze */ 138 error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 139 trace_xchk_fsthaw(sc, error); 140 return error; 141 } 142 143 /* 144 * We couldn't stabilize the filesystem long enough to sample all the variables 145 * that comprise the summary counters and compare them to the percpu counters. 146 * We need to disable all writer threads, which means taking the first two 147 * freeze levels to put userspace to sleep, and the third freeze level to 148 * prevent background threads from starting new transactions. Take one level 149 * more to prevent other callers from unfreezing the filesystem while we run. 150 */ 151 STATIC int 152 xchk_fscounters_freeze( 153 struct xfs_scrub *sc) 154 { 155 struct xchk_fscounters *fsc = sc->buf; 156 int error = 0; 157 158 if (sc->flags & XCHK_HAVE_FREEZE_PROT) { 159 sc->flags &= ~XCHK_HAVE_FREEZE_PROT; 160 mnt_drop_write_file(sc->file); 161 } 162 163 /* Try to grab a kernel freeze. */ 164 while ((error = xchk_fsfreeze(sc)) == -EBUSY) { 165 if (xchk_should_terminate(sc, &error)) 166 return error; 167 168 delay(HZ / 10); 169 } 170 if (error) 171 return error; 172 173 fsc->frozen = true; 174 return 0; 175 } 176 177 /* Thaw the filesystem after checking or repairing fscounters. 
*/ 178 STATIC void 179 xchk_fscounters_cleanup( 180 void *buf) 181 { 182 struct xchk_fscounters *fsc = buf; 183 struct xfs_scrub *sc = fsc->sc; 184 int error; 185 186 if (!fsc->frozen) 187 return; 188 189 error = xchk_fsthaw(sc); 190 if (error) 191 xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); 192 else 193 fsc->frozen = false; 194 } 195 196 int 197 xchk_setup_fscounters( 198 struct xfs_scrub *sc) 199 { 200 struct xchk_fscounters *fsc; 201 int error; 202 203 /* 204 * If the AGF doesn't track btreeblks, we have to lock the AGF to count 205 * btree block usage by walking the actual btrees. 206 */ 207 if (!xfs_has_lazysbcount(sc->mp)) 208 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 209 210 sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); 211 if (!sc->buf) 212 return -ENOMEM; 213 sc->buf_cleanup = xchk_fscounters_cleanup; 214 fsc = sc->buf; 215 fsc->sc = sc; 216 217 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 218 219 /* We must get the incore counters set up before we can proceed. */ 220 error = xchk_fscount_warmup(sc); 221 if (error) 222 return error; 223 224 /* 225 * Pause all writer activity in the filesystem while we're scrubbing to 226 * reduce the likelihood of background perturbations to the counters 227 * throwing off our calculations. 228 * 229 * If we're repairing, we need to prevent any other thread from 230 * changing the global fs summary counters while we're repairing them. 231 * This requires the fs to be frozen, which will disable background 232 * reclaim and purge all inactive inodes. 233 */ 234 if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { 235 error = xchk_fscounters_freeze(sc); 236 if (error) 237 return error; 238 } 239 240 return xchk_trans_alloc_empty(sc); 241 } 242 243 /* 244 * Part 1: Collecting filesystem summary counts. For each AG, we add its 245 * summary counts (total inodes, free inodes, free data blocks) to an incore 246 * copy of the overall filesystem summary counts. 
247 * 248 * To avoid false corruption reports in part 2, any failure in this part must 249 * set the INCOMPLETE flag even when a negative errno is returned. This care 250 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, 251 * ECANCELED) that are absorbed into a scrub state flag update by 252 * xchk_*_process_error. Scrub and repair share the same incore data 253 * structures, so the INCOMPLETE flag is critical to prevent a repair based on 254 * insufficient information. 255 */ 256 257 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 258 static int 259 xchk_fscount_btreeblks( 260 struct xfs_scrub *sc, 261 struct xchk_fscounters *fsc, 262 xfs_agnumber_t agno) 263 { 264 xfs_extlen_t blocks; 265 int error; 266 267 error = xchk_ag_init_existing(sc, agno, &sc->sa); 268 if (error) 269 goto out_free; 270 271 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); 272 if (error) 273 goto out_free; 274 fsc->fdblocks += blocks - 1; 275 276 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); 277 if (error) 278 goto out_free; 279 fsc->fdblocks += blocks - 1; 280 281 out_free: 282 xchk_ag_free(sc, &sc->sa); 283 return error; 284 } 285 286 /* 287 * Calculate what the global in-core counters ought to be from the incore 288 * per-AG structure. Callers can compare this to the actual in-core counters 289 * to estimate by how much both in-core and on-disk counters need to be 290 * adjusted. 291 */ 292 STATIC int 293 xchk_fscount_aggregate_agcounts( 294 struct xfs_scrub *sc, 295 struct xchk_fscounters *fsc) 296 { 297 struct xfs_mount *mp = sc->mp; 298 struct xfs_perag *pag; 299 uint64_t delayed; 300 xfs_agnumber_t agno; 301 int tries = 8; 302 int error = 0; 303 304 retry: 305 fsc->icount = 0; 306 fsc->ifree = 0; 307 fsc->fdblocks = 0; 308 309 for_each_perag(mp, agno, pag) { 310 if (xchk_should_terminate(sc, &error)) 311 break; 312 313 /* This somehow got unset since the warmup? 
*/ 314 if (!xfs_perag_initialised_agi(pag) || 315 !xfs_perag_initialised_agf(pag)) { 316 error = -EFSCORRUPTED; 317 break; 318 } 319 320 /* Count all the inodes */ 321 fsc->icount += pag->pagi_count; 322 fsc->ifree += pag->pagi_freecount; 323 324 /* Add up the free/freelist/bnobt/cntbt blocks */ 325 fsc->fdblocks += pag->pagf_freeblks; 326 fsc->fdblocks += pag->pagf_flcount; 327 if (xfs_has_lazysbcount(sc->mp)) { 328 fsc->fdblocks += pag->pagf_btreeblks; 329 } else { 330 error = xchk_fscount_btreeblks(sc, fsc, agno); 331 if (error) 332 break; 333 } 334 335 /* 336 * Per-AG reservations are taken out of the incore counters, 337 * so they must be left out of the free blocks computation. 338 */ 339 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 340 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 341 342 } 343 if (pag) 344 xfs_perag_rele(pag); 345 if (error) { 346 xchk_set_incomplete(sc); 347 return error; 348 } 349 350 /* 351 * The global incore space reservation is taken from the incore 352 * counters, so leave that out of the computation. 353 */ 354 fsc->fdblocks -= mp->m_resblks_avail; 355 356 /* 357 * Delayed allocation reservations are taken out of the incore counters 358 * but not recorded on disk, so leave them and their indlen blocks out 359 * of the computation. 360 */ 361 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 362 fsc->fdblocks -= delayed; 363 364 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 365 delayed); 366 367 368 /* Bail out if the values we compute are totally nonsense. */ 369 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 370 fsc->fdblocks > mp->m_sb.sb_dblocks || 371 fsc->ifree > fsc->icount_max) 372 return -EFSCORRUPTED; 373 374 /* 375 * If ifree > icount then we probably had some perturbation in the 376 * counters while we were calculating things. We'll try a few times 377 * to maintain ifree <= icount before giving up. 
378 */ 379 if (fsc->ifree > fsc->icount) { 380 if (tries--) 381 goto retry; 382 return -EDEADLOCK; 383 } 384 385 return 0; 386 } 387 388 #ifdef CONFIG_XFS_RT 389 STATIC int 390 xchk_fscount_add_frextent( 391 struct xfs_mount *mp, 392 struct xfs_trans *tp, 393 const struct xfs_rtalloc_rec *rec, 394 void *priv) 395 { 396 struct xchk_fscounters *fsc = priv; 397 int error = 0; 398 399 fsc->frextents += rec->ar_extcount; 400 401 xchk_should_terminate(fsc->sc, &error); 402 return error; 403 } 404 405 /* Calculate the number of free realtime extents from the realtime bitmap. */ 406 STATIC int 407 xchk_fscount_count_frextents( 408 struct xfs_scrub *sc, 409 struct xchk_fscounters *fsc) 410 { 411 struct xfs_mount *mp = sc->mp; 412 int error; 413 414 fsc->frextents = 0; 415 fsc->frextents_delayed = 0; 416 if (!xfs_has_realtime(mp)) 417 return 0; 418 419 xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); 420 error = xfs_rtalloc_query_all(sc->mp, sc->tp, 421 xchk_fscount_add_frextent, fsc); 422 if (error) { 423 xchk_set_incomplete(sc); 424 goto out_unlock; 425 } 426 427 fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); 428 429 out_unlock: 430 xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); 431 return error; 432 } 433 #else 434 STATIC int 435 xchk_fscount_count_frextents( 436 struct xfs_scrub *sc, 437 struct xchk_fscounters *fsc) 438 { 439 fsc->frextents = 0; 440 fsc->frextents_delayed = 0; 441 return 0; 442 } 443 #endif /* CONFIG_XFS_RT */ 444 445 /* 446 * Part 2: Comparing filesystem summary counters. All we have to do here is 447 * sum the percpu counters and compare them to what we've observed. 448 */ 449 450 /* 451 * Is the @counter reasonably close to the @expected value? 452 * 453 * We neither locked nor froze anything in the filesystem while aggregating the 454 * per-AG data to compute the @expected value, which means that the counter 455 * could have changed. 
We know the @old_value of the summation of the counter 456 * before the aggregation, and we re-sum the counter now. If the expected 457 * value falls between the two summations, we're ok. 458 * 459 * Otherwise, we /might/ have a problem. If the change in the summations is 460 * more than we want to tolerate, the filesystem is probably busy and we should 461 * just send back INCOMPLETE and see if userspace will try again. 462 * 463 * If we're repairing then we require an exact match. 464 */ 465 static inline bool 466 xchk_fscount_within_range( 467 struct xfs_scrub *sc, 468 const int64_t old_value, 469 struct percpu_counter *counter, 470 uint64_t expected) 471 { 472 int64_t min_value, max_value; 473 int64_t curr_value = percpu_counter_sum(counter); 474 475 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 476 old_value); 477 478 /* Negative values are always wrong. */ 479 if (curr_value < 0) 480 return false; 481 482 /* Exact matches are always ok. */ 483 if (curr_value == expected) 484 return true; 485 486 /* We require exact matches when repair is running. */ 487 if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) 488 return false; 489 490 min_value = min(old_value, curr_value); 491 max_value = max(old_value, curr_value); 492 493 /* Within the before-and-after range is ok. */ 494 if (expected >= min_value && expected <= max_value) 495 return true; 496 497 /* Everything else is bad. */ 498 return false; 499 } 500 501 /* Check the superblock counters. */ 502 int 503 xchk_fscounters( 504 struct xfs_scrub *sc) 505 { 506 struct xfs_mount *mp = sc->mp; 507 struct xchk_fscounters *fsc = sc->buf; 508 int64_t icount, ifree, fdblocks, frextents; 509 bool try_again = false; 510 int error; 511 512 /* Snapshot the percpu counters. */ 513 icount = percpu_counter_sum(&mp->m_icount); 514 ifree = percpu_counter_sum(&mp->m_ifree); 515 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 516 frextents = percpu_counter_sum(&mp->m_frextents); 517 518 /* No negative values, please! 
*/ 519 if (icount < 0 || ifree < 0) 520 xchk_set_corrupt(sc); 521 522 /* 523 * If the filesystem is not frozen, the counter summation calls above 524 * can race with xfs_dec_freecounter, which subtracts a requested space 525 * reservation from the counter and undoes the subtraction if that made 526 * the counter go negative. Therefore, it's possible to see negative 527 * values here, and we should only flag that as a corruption if we 528 * froze the fs. This is much more likely to happen with frextents 529 * since there are no reserved pools. 530 */ 531 if (fdblocks < 0 || frextents < 0) { 532 if (!fsc->frozen) 533 return -EDEADLOCK; 534 535 xchk_set_corrupt(sc); 536 return 0; 537 } 538 539 /* See if icount is obviously wrong. */ 540 if (icount < fsc->icount_min || icount > fsc->icount_max) 541 xchk_set_corrupt(sc); 542 543 /* See if fdblocks is obviously wrong. */ 544 if (fdblocks > mp->m_sb.sb_dblocks) 545 xchk_set_corrupt(sc); 546 547 /* See if frextents is obviously wrong. */ 548 if (frextents > mp->m_sb.sb_rextents) 549 xchk_set_corrupt(sc); 550 551 /* 552 * If ifree exceeds icount by more than the minimum variance then 553 * something's probably wrong with the counters. 554 */ 555 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 556 xchk_set_corrupt(sc); 557 558 /* Walk the incore AG headers to calculate the expected counters. */ 559 error = xchk_fscount_aggregate_agcounts(sc, fsc); 560 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 561 return error; 562 563 /* Count the free extents counter for rt volumes. */ 564 error = xchk_fscount_count_frextents(sc, fsc); 565 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 566 return error; 567 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 568 return 0; 569 570 /* 571 * Compare the in-core counters with whatever we counted. If the fs is 572 * frozen, we treat the discrepancy as a corruption because the freeze 573 * should have stabilized the counter values. 
Otherwise, we need 574 * userspace to call us back having granted us freeze permission. 575 */ 576 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, 577 fsc->icount)) { 578 if (fsc->frozen) 579 xchk_set_corrupt(sc); 580 else 581 try_again = true; 582 } 583 584 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { 585 if (fsc->frozen) 586 xchk_set_corrupt(sc); 587 else 588 try_again = true; 589 } 590 591 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 592 fsc->fdblocks)) { 593 if (fsc->frozen) 594 xchk_set_corrupt(sc); 595 else 596 try_again = true; 597 } 598 599 if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 600 fsc->frextents - fsc->frextents_delayed)) { 601 if (fsc->frozen) 602 xchk_set_corrupt(sc); 603 else 604 try_again = true; 605 } 606 607 if (try_again) 608 return -EDEADLOCK; 609 610 return 0; 611 } 612