// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/fscounters.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly. To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not taking
 * any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine by
 * walking all the AGs to make sure the incore per-AG structure has been
 * initialized. The expected value calculation then iterates the incore per-AG
 * structures as quickly as it can.
We snapshot the percpu counters before and 48 * after this operation and use the difference in counter values to guess at 49 * our tolerance for mismatch between expected and actual counter values. 50 */ 51 52 /* 53 * Since the expected value computation is lockless but only browses incore 54 * values, the percpu counters should be fairly close to each other. However, 55 * we'll allow ourselves to be off by at least this (arbitrary) amount. 56 */ 57 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 58 59 /* 60 * Make sure the per-AG structure has been initialized from the on-disk header 61 * contents and trust that the incore counters match the ondisk counters. (The 62 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 63 * summary counters after checking all AG headers). Do this from the setup 64 * function so that the inner AG aggregation loop runs as quickly as possible. 65 * 66 * This function runs during the setup phase /before/ we start checking any 67 * metadata. 68 */ 69 STATIC int 70 xchk_fscount_warmup( 71 struct xfs_scrub *sc) 72 { 73 struct xfs_mount *mp = sc->mp; 74 struct xfs_buf *agi_bp = NULL; 75 struct xfs_buf *agf_bp = NULL; 76 struct xfs_perag *pag = NULL; 77 xfs_agnumber_t agno; 78 int error = 0; 79 80 for_each_perag(mp, agno, pag) { 81 if (xchk_should_terminate(sc, &error)) 82 break; 83 if (xfs_perag_initialised_agi(pag) && 84 xfs_perag_initialised_agf(pag)) 85 continue; 86 87 /* Lock both AG headers. */ 88 error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); 89 if (error) 90 break; 91 error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); 92 if (error) 93 break; 94 95 /* 96 * These are supposed to be initialized by the header read 97 * function. 
98 */ 99 if (!xfs_perag_initialised_agi(pag) || 100 !xfs_perag_initialised_agf(pag)) { 101 error = -EFSCORRUPTED; 102 break; 103 } 104 105 xfs_buf_relse(agf_bp); 106 agf_bp = NULL; 107 xfs_buf_relse(agi_bp); 108 agi_bp = NULL; 109 } 110 111 if (agf_bp) 112 xfs_buf_relse(agf_bp); 113 if (agi_bp) 114 xfs_buf_relse(agi_bp); 115 if (pag) 116 xfs_perag_rele(pag); 117 return error; 118 } 119 120 static inline int 121 xchk_fsfreeze( 122 struct xfs_scrub *sc) 123 { 124 int error; 125 126 error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 127 trace_xchk_fsfreeze(sc, error); 128 return error; 129 } 130 131 static inline int 132 xchk_fsthaw( 133 struct xfs_scrub *sc) 134 { 135 int error; 136 137 /* This should always succeed, we have a kernel freeze */ 138 error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 139 trace_xchk_fsthaw(sc, error); 140 return error; 141 } 142 143 /* 144 * We couldn't stabilize the filesystem long enough to sample all the variables 145 * that comprise the summary counters and compare them to the percpu counters. 146 * We need to disable all writer threads, which means taking the first two 147 * freeze levels to put userspace to sleep, and the third freeze level to 148 * prevent background threads from starting new transactions. Take one level 149 * more to prevent other callers from unfreezing the filesystem while we run. 150 */ 151 STATIC int 152 xchk_fscounters_freeze( 153 struct xfs_scrub *sc) 154 { 155 struct xchk_fscounters *fsc = sc->buf; 156 int error = 0; 157 158 if (sc->flags & XCHK_HAVE_FREEZE_PROT) { 159 sc->flags &= ~XCHK_HAVE_FREEZE_PROT; 160 mnt_drop_write_file(sc->file); 161 } 162 163 /* Try to grab a kernel freeze. */ 164 while ((error = xchk_fsfreeze(sc)) == -EBUSY) { 165 if (xchk_should_terminate(sc, &error)) 166 return error; 167 168 delay(HZ / 10); 169 } 170 if (error) 171 return error; 172 173 fsc->frozen = true; 174 return 0; 175 } 176 177 /* Thaw the filesystem after checking or repairing fscounters. 
*/ 178 STATIC void 179 xchk_fscounters_cleanup( 180 void *buf) 181 { 182 struct xchk_fscounters *fsc = buf; 183 struct xfs_scrub *sc = fsc->sc; 184 int error; 185 186 if (!fsc->frozen) 187 return; 188 189 error = xchk_fsthaw(sc); 190 if (error) 191 xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); 192 else 193 fsc->frozen = false; 194 } 195 196 int 197 xchk_setup_fscounters( 198 struct xfs_scrub *sc) 199 { 200 struct xchk_fscounters *fsc; 201 int error; 202 203 /* 204 * If the AGF doesn't track btreeblks, we have to lock the AGF to count 205 * btree block usage by walking the actual btrees. 206 */ 207 if (!xfs_has_lazysbcount(sc->mp)) 208 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 209 210 sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); 211 if (!sc->buf) 212 return -ENOMEM; 213 sc->buf_cleanup = xchk_fscounters_cleanup; 214 fsc = sc->buf; 215 fsc->sc = sc; 216 217 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 218 219 /* We must get the incore counters set up before we can proceed. */ 220 error = xchk_fscount_warmup(sc); 221 if (error) 222 return error; 223 224 /* 225 * Pause all writer activity in the filesystem while we're scrubbing to 226 * reduce the likelihood of background perturbations to the counters 227 * throwing off our calculations. 228 * 229 * If we're repairing, we need to prevent any other thread from 230 * changing the global fs summary counters while we're repairing them. 231 * This requires the fs to be frozen, which will disable background 232 * reclaim and purge all inactive inodes. 233 */ 234 if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { 235 error = xchk_fscounters_freeze(sc); 236 if (error) 237 return error; 238 } 239 240 return xchk_trans_alloc_empty(sc); 241 } 242 243 /* 244 * Part 1: Collecting filesystem summary counts. For each AG, we add its 245 * summary counts (total inodes, free inodes, free data blocks) to an incore 246 * copy of the overall filesystem summary counts. 
247 * 248 * To avoid false corruption reports in part 2, any failure in this part must 249 * set the INCOMPLETE flag even when a negative errno is returned. This care 250 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, 251 * ECANCELED) that are absorbed into a scrub state flag update by 252 * xchk_*_process_error. Scrub and repair share the same incore data 253 * structures, so the INCOMPLETE flag is critical to prevent a repair based on 254 * insufficient information. 255 */ 256 257 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 258 static int 259 xchk_fscount_btreeblks( 260 struct xfs_scrub *sc, 261 struct xchk_fscounters *fsc, 262 xfs_agnumber_t agno) 263 { 264 xfs_extlen_t blocks; 265 int error; 266 267 error = xchk_ag_init_existing(sc, agno, &sc->sa); 268 if (error) 269 goto out_free; 270 271 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); 272 if (error) 273 goto out_free; 274 fsc->fdblocks += blocks - 1; 275 276 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); 277 if (error) 278 goto out_free; 279 fsc->fdblocks += blocks - 1; 280 281 out_free: 282 xchk_ag_free(sc, &sc->sa); 283 return error; 284 } 285 286 /* 287 * Calculate what the global in-core counters ought to be from the incore 288 * per-AG structure. Callers can compare this to the actual in-core counters 289 * to estimate by how much both in-core and on-disk counters need to be 290 * adjusted. 291 */ 292 STATIC int 293 xchk_fscount_aggregate_agcounts( 294 struct xfs_scrub *sc, 295 struct xchk_fscounters *fsc) 296 { 297 struct xfs_mount *mp = sc->mp; 298 struct xfs_perag *pag; 299 uint64_t delayed; 300 xfs_agnumber_t agno; 301 int tries = 8; 302 int error = 0; 303 304 retry: 305 fsc->icount = 0; 306 fsc->ifree = 0; 307 fsc->fdblocks = 0; 308 309 for_each_perag(mp, agno, pag) { 310 if (xchk_should_terminate(sc, &error)) 311 break; 312 313 /* This somehow got unset since the warmup? 
*/ 314 if (!xfs_perag_initialised_agi(pag) || 315 !xfs_perag_initialised_agf(pag)) { 316 error = -EFSCORRUPTED; 317 break; 318 } 319 320 /* Count all the inodes */ 321 fsc->icount += pag->pagi_count; 322 fsc->ifree += pag->pagi_freecount; 323 324 /* Add up the free/freelist/bnobt/cntbt blocks */ 325 fsc->fdblocks += pag->pagf_freeblks; 326 fsc->fdblocks += pag->pagf_flcount; 327 if (xfs_has_lazysbcount(sc->mp)) { 328 fsc->fdblocks += pag->pagf_btreeblks; 329 } else { 330 error = xchk_fscount_btreeblks(sc, fsc, agno); 331 if (error) 332 break; 333 } 334 335 /* 336 * Per-AG reservations are taken out of the incore counters, 337 * so they must be left out of the free blocks computation. 338 */ 339 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 340 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 341 342 } 343 if (pag) 344 xfs_perag_rele(pag); 345 if (error) { 346 xchk_set_incomplete(sc); 347 return error; 348 } 349 350 /* 351 * The global incore space reservation is taken from the incore 352 * counters, so leave that out of the computation. 353 */ 354 fsc->fdblocks -= mp->m_resblks_avail; 355 356 /* 357 * Delayed allocation reservations are taken out of the incore counters 358 * but not recorded on disk, so leave them and their indlen blocks out 359 * of the computation. 360 */ 361 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 362 fsc->fdblocks -= delayed; 363 364 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 365 delayed); 366 367 368 /* Bail out if the values we compute are totally nonsense. */ 369 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 370 fsc->fdblocks > mp->m_sb.sb_dblocks || 371 fsc->ifree > fsc->icount_max) 372 return -EFSCORRUPTED; 373 374 /* 375 * If ifree > icount then we probably had some perturbation in the 376 * counters while we were calculating things. We'll try a few times 377 * to maintain ifree <= icount before giving up. 
378 */ 379 if (fsc->ifree > fsc->icount) { 380 if (tries--) 381 goto retry; 382 return -EDEADLOCK; 383 } 384 385 return 0; 386 } 387 388 #ifdef CONFIG_XFS_RT 389 STATIC int 390 xchk_fscount_add_frextent( 391 struct xfs_mount *mp, 392 struct xfs_trans *tp, 393 const struct xfs_rtalloc_rec *rec, 394 void *priv) 395 { 396 struct xchk_fscounters *fsc = priv; 397 int error = 0; 398 399 fsc->frextents += rec->ar_extcount; 400 401 xchk_should_terminate(fsc->sc, &error); 402 return error; 403 } 404 405 /* Calculate the number of free realtime extents from the realtime bitmap. */ 406 STATIC int 407 xchk_fscount_count_frextents( 408 struct xfs_scrub *sc, 409 struct xchk_fscounters *fsc) 410 { 411 struct xfs_mount *mp = sc->mp; 412 int error; 413 414 fsc->frextents = 0; 415 if (!xfs_has_realtime(mp)) 416 return 0; 417 418 xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 419 error = xfs_rtalloc_query_all(sc->mp, sc->tp, 420 xchk_fscount_add_frextent, fsc); 421 if (error) { 422 xchk_set_incomplete(sc); 423 goto out_unlock; 424 } 425 426 out_unlock: 427 xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 428 return error; 429 } 430 #else 431 STATIC int 432 xchk_fscount_count_frextents( 433 struct xfs_scrub *sc, 434 struct xchk_fscounters *fsc) 435 { 436 fsc->frextents = 0; 437 return 0; 438 } 439 #endif /* CONFIG_XFS_RT */ 440 441 /* 442 * Part 2: Comparing filesystem summary counters. All we have to do here is 443 * sum the percpu counters and compare them to what we've observed. 444 */ 445 446 /* 447 * Is the @counter reasonably close to the @expected value? 448 * 449 * We neither locked nor froze anything in the filesystem while aggregating the 450 * per-AG data to compute the @expected value, which means that the counter 451 * could have changed. We know the @old_value of the summation of the counter 452 * before the aggregation, and we re-sum the counter now. If the expected 453 * value falls between the two summations, we're ok. 
454 * 455 * Otherwise, we /might/ have a problem. If the change in the summations is 456 * more than we want to tolerate, the filesystem is probably busy and we should 457 * just send back INCOMPLETE and see if userspace will try again. 458 * 459 * If we're repairing then we require an exact match. 460 */ 461 static inline bool 462 xchk_fscount_within_range( 463 struct xfs_scrub *sc, 464 const int64_t old_value, 465 struct percpu_counter *counter, 466 uint64_t expected) 467 { 468 int64_t min_value, max_value; 469 int64_t curr_value = percpu_counter_sum(counter); 470 471 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 472 old_value); 473 474 /* Negative values are always wrong. */ 475 if (curr_value < 0) 476 return false; 477 478 /* Exact matches are always ok. */ 479 if (curr_value == expected) 480 return true; 481 482 /* We require exact matches when repair is running. */ 483 if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) 484 return false; 485 486 min_value = min(old_value, curr_value); 487 max_value = max(old_value, curr_value); 488 489 /* Within the before-and-after range is ok. */ 490 if (expected >= min_value && expected <= max_value) 491 return true; 492 493 /* Everything else is bad. */ 494 return false; 495 } 496 497 /* Check the superblock counters. */ 498 int 499 xchk_fscounters( 500 struct xfs_scrub *sc) 501 { 502 struct xfs_mount *mp = sc->mp; 503 struct xchk_fscounters *fsc = sc->buf; 504 int64_t icount, ifree, fdblocks, frextents; 505 bool try_again = false; 506 int error; 507 508 /* Snapshot the percpu counters. */ 509 icount = percpu_counter_sum(&mp->m_icount); 510 ifree = percpu_counter_sum(&mp->m_ifree); 511 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 512 frextents = percpu_counter_sum(&mp->m_frextents); 513 514 /* No negative values, please! 
*/ 515 if (icount < 0 || ifree < 0) 516 xchk_set_corrupt(sc); 517 518 /* 519 * If the filesystem is not frozen, the counter summation calls above 520 * can race with xfs_mod_freecounter, which subtracts a requested space 521 * reservation from the counter and undoes the subtraction if that made 522 * the counter go negative. Therefore, it's possible to see negative 523 * values here, and we should only flag that as a corruption if we 524 * froze the fs. This is much more likely to happen with frextents 525 * since there are no reserved pools. 526 */ 527 if (fdblocks < 0 || frextents < 0) { 528 if (!fsc->frozen) 529 return -EDEADLOCK; 530 531 xchk_set_corrupt(sc); 532 return 0; 533 } 534 535 /* See if icount is obviously wrong. */ 536 if (icount < fsc->icount_min || icount > fsc->icount_max) 537 xchk_set_corrupt(sc); 538 539 /* See if fdblocks is obviously wrong. */ 540 if (fdblocks > mp->m_sb.sb_dblocks) 541 xchk_set_corrupt(sc); 542 543 /* See if frextents is obviously wrong. */ 544 if (frextents > mp->m_sb.sb_rextents) 545 xchk_set_corrupt(sc); 546 547 /* 548 * If ifree exceeds icount by more than the minimum variance then 549 * something's probably wrong with the counters. 550 */ 551 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 552 xchk_set_corrupt(sc); 553 554 /* Walk the incore AG headers to calculate the expected counters. */ 555 error = xchk_fscount_aggregate_agcounts(sc, fsc); 556 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 557 return error; 558 559 /* Count the free extents counter for rt volumes. */ 560 error = xchk_fscount_count_frextents(sc, fsc); 561 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 562 return error; 563 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 564 return 0; 565 566 /* 567 * Compare the in-core counters with whatever we counted. If the fs is 568 * frozen, we treat the discrepancy as a corruption because the freeze 569 * should have stabilized the counter values. 
Otherwise, we need 570 * userspace to call us back having granted us freeze permission. 571 */ 572 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, 573 fsc->icount)) { 574 if (fsc->frozen) 575 xchk_set_corrupt(sc); 576 else 577 try_again = true; 578 } 579 580 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { 581 if (fsc->frozen) 582 xchk_set_corrupt(sc); 583 else 584 try_again = true; 585 } 586 587 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 588 fsc->fdblocks)) { 589 if (fsc->frozen) 590 xchk_set_corrupt(sc); 591 else 592 try_again = true; 593 } 594 595 if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 596 fsc->frextents)) { 597 if (fsc->frozen) 598 xchk_set_corrupt(sc); 599 else 600 try_again = true; 601 } 602 603 if (try_again) 604 return -EDEADLOCK; 605 606 return 0; 607 } 608