1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2019-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_log_format.h" 12 #include "xfs_trans.h" 13 #include "xfs_mount.h" 14 #include "xfs_alloc.h" 15 #include "xfs_ialloc.h" 16 #include "xfs_health.h" 17 #include "xfs_btree.h" 18 #include "xfs_ag.h" 19 #include "xfs_rtbitmap.h" 20 #include "xfs_inode.h" 21 #include "xfs_icache.h" 22 #include "xfs_rtgroup.h" 23 #include "scrub/scrub.h" 24 #include "scrub/common.h" 25 #include "scrub/trace.h" 26 #include "scrub/fscounters.h" 27 28 /* 29 * FS Summary Counters 30 * =================== 31 * 32 * The basics of filesystem summary counter checking are that we iterate the 33 * AGs counting the number of free blocks, free space btree blocks, per-AG 34 * reservations, inodes, delayed allocation reservations, and free inodes. 35 * Then we compare what we computed against the in-core counters. 36 * 37 * However, the reality is that summary counters are a tricky beast to check. 38 * While we /could/ freeze the filesystem and scramble around the AGs counting 39 * the free blocks, in practice we prefer not do that for a scan because 40 * freezing is costly. To get around this, we added a per-cpu counter of the 41 * delalloc reservations so that we can rotor around the AGs relatively 42 * quickly, and we allow the counts to be slightly off because we're not taking 43 * any locks while we do this. 44 * 45 * So the first thing we do is warm up the buffer cache in the setup routine by 46 * walking all the AGs to make sure the incore per-AG structure has been 47 * initialized. The expected value calculation then iterates the incore per-AG 48 * structures as quickly as it can. We snapshot the percpu counters before and 49 * after this operation and use the difference in counter values to guess at 50 * our tolerance for mismatch between expected and actual counter values. 51 */ 52 53 /* 54 * Since the expected value computation is lockless but only browses incore 55 * values, the percpu counters should be fairly close to each other. However, 56 * we'll allow ourselves to be off by at least this (arbitrary) amount. 57 */ 58 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 59 60 /* 61 * Make sure the per-AG structure has been initialized from the on-disk header 62 * contents and trust that the incore counters match the ondisk counters. (The 63 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 64 * summary counters after checking all AG headers). Do this from the setup 65 * function so that the inner AG aggregation loop runs as quickly as possible. 66 * 67 * This function runs during the setup phase /before/ we start checking any 68 * metadata. 69 */ 70 STATIC int 71 xchk_fscount_warmup( 72 struct xfs_scrub *sc) 73 { 74 struct xfs_mount *mp = sc->mp; 75 struct xfs_buf *agi_bp = NULL; 76 struct xfs_buf *agf_bp = NULL; 77 struct xfs_perag *pag = NULL; 78 int error = 0; 79 80 while ((pag = xfs_perag_next(mp, pag))) { 81 if (xchk_should_terminate(sc, &error)) 82 break; 83 if (xfs_perag_initialised_agi(pag) && 84 xfs_perag_initialised_agf(pag)) 85 continue; 86 87 /* Lock both AG headers. */ 88 error = xfs_ialloc_read_agi(pag, sc->tp, 0, &agi_bp); 89 if (error) 90 break; 91 error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); 92 if (error) 93 break; 94 95 /* 96 * These are supposed to be initialized by the header read 97 * function. 98 */ 99 if (!xfs_perag_initialised_agi(pag) || 100 !xfs_perag_initialised_agf(pag)) { 101 error = -EFSCORRUPTED; 102 break; 103 } 104 105 xfs_buf_relse(agf_bp); 106 agf_bp = NULL; 107 xfs_buf_relse(agi_bp); 108 agi_bp = NULL; 109 } 110 111 if (agf_bp) 112 xfs_buf_relse(agf_bp); 113 if (agi_bp) 114 xfs_buf_relse(agi_bp); 115 if (pag) 116 xfs_perag_rele(pag); 117 return error; 118 } 119 120 static inline int 121 xchk_fsfreeze( 122 struct xfs_scrub *sc) 123 { 124 int error; 125 126 error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 127 trace_xchk_fsfreeze(sc, error); 128 return error; 129 } 130 131 static inline int 132 xchk_fsthaw( 133 struct xfs_scrub *sc) 134 { 135 int error; 136 137 /* This should always succeed, we have a kernel freeze */ 138 error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 139 trace_xchk_fsthaw(sc, error); 140 return error; 141 } 142 143 /* 144 * We couldn't stabilize the filesystem long enough to sample all the variables 145 * that comprise the summary counters and compare them to the percpu counters. 146 * We need to disable all writer threads, which means taking the first two 147 * freeze levels to put userspace to sleep, and the third freeze level to 148 * prevent background threads from starting new transactions. Take one level 149 * more to prevent other callers from unfreezing the filesystem while we run. 150 */ 151 STATIC int 152 xchk_fscounters_freeze( 153 struct xfs_scrub *sc) 154 { 155 struct xchk_fscounters *fsc = sc->buf; 156 int error = 0; 157 158 if (sc->flags & XCHK_HAVE_FREEZE_PROT) { 159 sc->flags &= ~XCHK_HAVE_FREEZE_PROT; 160 mnt_drop_write_file(sc->file); 161 } 162 163 /* Try to grab a kernel freeze. */ 164 while ((error = xchk_fsfreeze(sc)) == -EBUSY) { 165 if (xchk_should_terminate(sc, &error)) 166 return error; 167 168 delay(HZ / 10); 169 } 170 if (error) 171 return error; 172 173 fsc->frozen = true; 174 return 0; 175 } 176 177 /* Thaw the filesystem after checking or repairing fscounters. */ 178 STATIC void 179 xchk_fscounters_cleanup( 180 void *buf) 181 { 182 struct xchk_fscounters *fsc = buf; 183 struct xfs_scrub *sc = fsc->sc; 184 int error; 185 186 if (!fsc->frozen) 187 return; 188 189 error = xchk_fsthaw(sc); 190 if (error) 191 xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); 192 else 193 fsc->frozen = false; 194 } 195 196 int 197 xchk_setup_fscounters( 198 struct xfs_scrub *sc) 199 { 200 struct xchk_fscounters *fsc; 201 int error; 202 203 /* 204 * If the AGF doesn't track btreeblks, we have to lock the AGF to count 205 * btree block usage by walking the actual btrees. 206 */ 207 if (!xfs_has_lazysbcount(sc->mp)) 208 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 209 210 sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); 211 if (!sc->buf) 212 return -ENOMEM; 213 sc->buf_cleanup = xchk_fscounters_cleanup; 214 fsc = sc->buf; 215 fsc->sc = sc; 216 217 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 218 219 /* We must get the incore counters set up before we can proceed. */ 220 error = xchk_fscount_warmup(sc); 221 if (error) 222 return error; 223 224 /* 225 * Pause all writer activity in the filesystem while we're scrubbing to 226 * reduce the likelihood of background perturbations to the counters 227 * throwing off our calculations. 228 * 229 * If we're repairing, we need to prevent any other thread from 230 * changing the global fs summary counters while we're repairing them. 231 * This requires the fs to be frozen, which will disable background 232 * reclaim and purge all inactive inodes. 233 */ 234 if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { 235 error = xchk_fscounters_freeze(sc); 236 if (error) 237 return error; 238 } 239 240 return xchk_trans_alloc_empty(sc); 241 } 242 243 /* 244 * Part 1: Collecting filesystem summary counts. For each AG, we add its 245 * summary counts (total inodes, free inodes, free data blocks) to an incore 246 * copy of the overall filesystem summary counts. 247 * 248 * To avoid false corruption reports in part 2, any failure in this part must 249 * set the INCOMPLETE flag even when a negative errno is returned. This care 250 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, 251 * ECANCELED) that are absorbed into a scrub state flag update by 252 * xchk_*_process_error. Scrub and repair share the same incore data 253 * structures, so the INCOMPLETE flag is critical to prevent a repair based on 254 * insufficient information. 255 */ 256 257 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 258 static int 259 xchk_fscount_btreeblks( 260 struct xfs_scrub *sc, 261 struct xchk_fscounters *fsc, 262 xfs_agnumber_t agno) 263 { 264 xfs_filblks_t blocks; 265 int error; 266 267 error = xchk_ag_init_existing(sc, agno, &sc->sa); 268 if (error) 269 goto out_free; 270 271 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); 272 if (error) 273 goto out_free; 274 fsc->fdblocks += blocks - 1; 275 276 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); 277 if (error) 278 goto out_free; 279 fsc->fdblocks += blocks - 1; 280 281 out_free: 282 xchk_ag_free(sc, &sc->sa); 283 return error; 284 } 285 286 /* 287 * Calculate what the global in-core counters ought to be from the incore 288 * per-AG structure. Callers can compare this to the actual in-core counters 289 * to estimate by how much both in-core and on-disk counters need to be 290 * adjusted. 291 */ 292 STATIC int 293 xchk_fscount_aggregate_agcounts( 294 struct xfs_scrub *sc, 295 struct xchk_fscounters *fsc) 296 { 297 struct xfs_mount *mp = sc->mp; 298 struct xfs_perag *pag = NULL; 299 uint64_t delayed; 300 int tries = 8; 301 int error = 0; 302 303 retry: 304 fsc->icount = 0; 305 fsc->ifree = 0; 306 fsc->fdblocks = 0; 307 308 while ((pag = xfs_perag_next(mp, pag))) { 309 if (xchk_should_terminate(sc, &error)) 310 break; 311 312 /* This somehow got unset since the warmup? */ 313 if (!xfs_perag_initialised_agi(pag) || 314 !xfs_perag_initialised_agf(pag)) { 315 error = -EFSCORRUPTED; 316 break; 317 } 318 319 /* Count all the inodes */ 320 fsc->icount += pag->pagi_count; 321 fsc->ifree += pag->pagi_freecount; 322 323 /* Add up the free/freelist/bnobt/cntbt blocks */ 324 fsc->fdblocks += pag->pagf_freeblks; 325 fsc->fdblocks += pag->pagf_flcount; 326 if (xfs_has_lazysbcount(sc->mp)) { 327 fsc->fdblocks += pag->pagf_btreeblks; 328 } else { 329 error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag)); 330 if (error) 331 break; 332 } 333 334 /* 335 * Per-AG reservations are taken out of the incore counters, 336 * so they must be left out of the free blocks computation. 337 */ 338 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 339 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 340 341 } 342 if (pag) 343 xfs_perag_rele(pag); 344 if (error) { 345 xchk_set_incomplete(sc); 346 return error; 347 } 348 349 /* 350 * The global incore space reservation is taken from the incore 351 * counters, so leave that out of the computation. 352 */ 353 fsc->fdblocks -= mp->m_resblks_avail; 354 355 /* 356 * Delayed allocation reservations are taken out of the incore counters 357 * but not recorded on disk, so leave them and their indlen blocks out 358 * of the computation. 359 */ 360 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 361 fsc->fdblocks -= delayed; 362 363 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 364 delayed); 365 366 367 /* Bail out if the values we compute are totally nonsense. */ 368 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 369 fsc->fdblocks > mp->m_sb.sb_dblocks || 370 fsc->ifree > fsc->icount_max) 371 return -EFSCORRUPTED; 372 373 /* 374 * If ifree > icount then we probably had some perturbation in the 375 * counters while we were calculating things. We'll try a few times 376 * to maintain ifree <= icount before giving up. 377 */ 378 if (fsc->ifree > fsc->icount) { 379 if (tries--) 380 goto retry; 381 return -EDEADLOCK; 382 } 383 384 return 0; 385 } 386 387 #ifdef CONFIG_XFS_RT 388 STATIC int 389 xchk_fscount_add_frextent( 390 struct xfs_rtgroup *rtg, 391 struct xfs_trans *tp, 392 const struct xfs_rtalloc_rec *rec, 393 void *priv) 394 { 395 struct xchk_fscounters *fsc = priv; 396 int error = 0; 397 398 fsc->frextents += rec->ar_extcount; 399 400 xchk_should_terminate(fsc->sc, &error); 401 return error; 402 } 403 404 /* Calculate the number of free realtime extents from the realtime bitmap. */ 405 STATIC int 406 xchk_fscount_count_frextents( 407 struct xfs_scrub *sc, 408 struct xchk_fscounters *fsc) 409 { 410 struct xfs_mount *mp = sc->mp; 411 struct xfs_rtgroup *rtg = NULL; 412 int error; 413 414 fsc->frextents = 0; 415 fsc->frextents_delayed = 0; 416 if (!xfs_has_realtime(mp)) 417 return 0; 418 419 while ((rtg = xfs_rtgroup_next(mp, rtg))) { 420 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 421 error = xfs_rtalloc_query_all(rtg, sc->tp, 422 xchk_fscount_add_frextent, fsc); 423 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 424 if (error) { 425 xchk_set_incomplete(sc); 426 xfs_rtgroup_rele(rtg); 427 return error; 428 } 429 } 430 431 fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); 432 return 0; 433 } 434 #else 435 STATIC int 436 xchk_fscount_count_frextents( 437 struct xfs_scrub *sc, 438 struct xchk_fscounters *fsc) 439 { 440 fsc->frextents = 0; 441 fsc->frextents_delayed = 0; 442 return 0; 443 } 444 #endif /* CONFIG_XFS_RT */ 445 446 /* 447 * Part 2: Comparing filesystem summary counters. All we have to do here is 448 * sum the percpu counters and compare them to what we've observed. 449 */ 450 451 /* 452 * Is the @counter reasonably close to the @expected value? 453 * 454 * We neither locked nor froze anything in the filesystem while aggregating the 455 * per-AG data to compute the @expected value, which means that the counter 456 * could have changed. We know the @old_value of the summation of the counter 457 * before the aggregation, and we re-sum the counter now. If the expected 458 * value falls between the two summations, we're ok. 459 * 460 * Otherwise, we /might/ have a problem. If the change in the summations is 461 * more than we want to tolerate, the filesystem is probably busy and we should 462 * just send back INCOMPLETE and see if userspace will try again. 463 * 464 * If we're repairing then we require an exact match. 465 */ 466 static inline bool 467 xchk_fscount_within_range( 468 struct xfs_scrub *sc, 469 const int64_t old_value, 470 struct percpu_counter *counter, 471 uint64_t expected) 472 { 473 int64_t min_value, max_value; 474 int64_t curr_value = percpu_counter_sum(counter); 475 476 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 477 old_value); 478 479 /* Negative values are always wrong. */ 480 if (curr_value < 0) 481 return false; 482 483 /* Exact matches are always ok. */ 484 if (curr_value == expected) 485 return true; 486 487 /* We require exact matches when repair is running. */ 488 if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) 489 return false; 490 491 min_value = min(old_value, curr_value); 492 max_value = max(old_value, curr_value); 493 494 /* Within the before-and-after range is ok. */ 495 if (expected >= min_value && expected <= max_value) 496 return true; 497 498 /* Everything else is bad. */ 499 return false; 500 } 501 502 /* Check the superblock counters. */ 503 int 504 xchk_fscounters( 505 struct xfs_scrub *sc) 506 { 507 struct xfs_mount *mp = sc->mp; 508 struct xchk_fscounters *fsc = sc->buf; 509 int64_t icount, ifree, fdblocks, frextents; 510 bool try_again = false; 511 int error; 512 513 /* Snapshot the percpu counters. */ 514 icount = percpu_counter_sum(&mp->m_icount); 515 ifree = percpu_counter_sum(&mp->m_ifree); 516 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 517 frextents = percpu_counter_sum(&mp->m_frextents); 518 519 /* No negative values, please! */ 520 if (icount < 0 || ifree < 0) 521 xchk_set_corrupt(sc); 522 523 /* 524 * If the filesystem is not frozen, the counter summation calls above 525 * can race with xfs_dec_freecounter, which subtracts a requested space 526 * reservation from the counter and undoes the subtraction if that made 527 * the counter go negative. Therefore, it's possible to see negative 528 * values here, and we should only flag that as a corruption if we 529 * froze the fs. This is much more likely to happen with frextents 530 * since there are no reserved pools. 531 */ 532 if (fdblocks < 0 || frextents < 0) { 533 if (!fsc->frozen) 534 return -EDEADLOCK; 535 536 xchk_set_corrupt(sc); 537 return 0; 538 } 539 540 /* See if icount is obviously wrong. */ 541 if (icount < fsc->icount_min || icount > fsc->icount_max) 542 xchk_set_corrupt(sc); 543 544 /* See if fdblocks is obviously wrong. */ 545 if (fdblocks > mp->m_sb.sb_dblocks) 546 xchk_set_corrupt(sc); 547 548 /* See if frextents is obviously wrong. */ 549 if (frextents > mp->m_sb.sb_rextents) 550 xchk_set_corrupt(sc); 551 552 /* 553 * If ifree exceeds icount by more than the minimum variance then 554 * something's probably wrong with the counters. 555 */ 556 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 557 xchk_set_corrupt(sc); 558 559 /* Walk the incore AG headers to calculate the expected counters. */ 560 error = xchk_fscount_aggregate_agcounts(sc, fsc); 561 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 562 return error; 563 564 /* Count the free extents counter for rt volumes. */ 565 error = xchk_fscount_count_frextents(sc, fsc); 566 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 567 return error; 568 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 569 return 0; 570 571 /* 572 * Compare the in-core counters with whatever we counted. If the fs is 573 * frozen, we treat the discrepancy as a corruption because the freeze 574 * should have stabilized the counter values. Otherwise, we need 575 * userspace to call us back having granted us freeze permission. 576 */ 577 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, 578 fsc->icount)) { 579 if (fsc->frozen) 580 xchk_set_corrupt(sc); 581 else 582 try_again = true; 583 } 584 585 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { 586 if (fsc->frozen) 587 xchk_set_corrupt(sc); 588 else 589 try_again = true; 590 } 591 592 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 593 fsc->fdblocks)) { 594 if (fsc->frozen) 595 xchk_set_corrupt(sc); 596 else 597 try_again = true; 598 } 599 600 if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 601 fsc->frextents - fsc->frextents_delayed)) { 602 if (fsc->frozen) 603 xchk_set_corrupt(sc); 604 else 605 try_again = true; 606 } 607 608 if (try_again) 609 return -EDEADLOCK; 610 611 return 0; 612 } 613