1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2017-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans.h" 14 #include "xfs_inode.h" 15 #include "xfs_quota.h" 16 #include "xfs_qm.h" 17 #include "xfs_scrub.h" 18 #include "xfs_buf_mem.h" 19 #include "xfs_rmap.h" 20 #include "xfs_exchrange.h" 21 #include "xfs_exchmaps.h" 22 #include "xfs_dir2.h" 23 #include "xfs_parent.h" 24 #include "xfs_icache.h" 25 #include "scrub/scrub.h" 26 #include "scrub/common.h" 27 #include "scrub/trace.h" 28 #include "scrub/repair.h" 29 #include "scrub/health.h" 30 #include "scrub/stats.h" 31 #include "scrub/xfile.h" 32 #include "scrub/tempfile.h" 33 #include "scrub/orphanage.h" 34 35 /* 36 * Online Scrub and Repair 37 * 38 * Traditionally, XFS (the kernel driver) did not know how to check or 39 * repair on-disk data structures. That task was left to the xfs_check 40 * and xfs_repair tools, both of which require taking the filesystem 41 * offline for a thorough but time consuming examination. Online 42 * scrub & repair, on the other hand, enables us to check the metadata 43 * for obvious errors while carefully stepping around the filesystem's 44 * ongoing operations, locking rules, etc. 45 * 46 * Given that most XFS metadata consist of records stored in a btree, 47 * most of the checking functions iterate the btree blocks themselves 48 * looking for irregularities. When a record block is encountered, each 49 * record can be checked for obviously bad values. Record values can 50 * also be cross-referenced against other btrees to look for potential 51 * misunderstandings between pieces of metadata. 
52 * 53 * It is expected that the checkers responsible for per-AG metadata 54 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the 55 * metadata structure, and perform any relevant cross-referencing before 56 * unlocking the AG and returning the results to userspace. These 57 * scrubbers must not keep an AG locked for too long to avoid tying up 58 * the block and inode allocators. 59 * 60 * Block maps and b-trees rooted in an inode present a special challenge 61 * because they can involve extents from any AG. The general scrubber 62 * structure of lock -> check -> xref -> unlock still holds, but AG 63 * locking order rules /must/ be obeyed to avoid deadlocks. The 64 * ordering rule, of course, is that we must lock in increasing AG 65 * order. Helper functions are provided to track which AG headers we've 66 * already locked. If we detect an imminent locking order violation, we 67 * can signal a potential deadlock, in which case the scrubber can jump 68 * out to the top level, lock all the AGs in order, and retry the scrub. 69 * 70 * For file data (directories, extended attributes, symlinks) scrub, we 71 * can simply lock the inode and walk the data. For btree data 72 * (directories and attributes) we follow the same btree-scrubbing 73 * strategy outlined previously to check the records. 74 * 75 * We use a bit of trickery with transactions to avoid buffer deadlocks 76 * if there is a cycle in the metadata. The basic problem is that 77 * travelling down a btree involves locking the current buffer at each 78 * tree level. If a pointer should somehow point back to a buffer that 79 * we've already examined, we will deadlock due to the second buffer 80 * locking attempt. Note however that grabbing a buffer in transaction 81 * context links the locked buffer to the transaction. If we try to 82 * re-grab the buffer in the context of the same transaction, we avoid 83 * the second lock attempt and continue. 
Between the verifier and the 84 * scrubber, something will notice that something is amiss and report 85 * the corruption. Therefore, each scrubber will allocate an empty 86 * transaction, attach buffers to it, and cancel the transaction at the 87 * end of the scrub run. Cancelling a non-dirty transaction simply 88 * unlocks the buffers. 89 * 90 * There are four pieces of data that scrub can communicate to 91 * userspace. The first is the error code (errno), which can be used to 92 * communicate operational errors in performing the scrub. There are 93 * also three flags that can be set in the scrub context. If the data 94 * structure itself is corrupt, the CORRUPT flag will be set. If 95 * the metadata is correct but otherwise suboptimal, the PREEN flag 96 * will be set. 97 * 98 * We perform secondary validation of filesystem metadata by 99 * cross-referencing every record with all other available metadata. 100 * For example, for block mapping extents, we verify that there are no 101 * records in the free space and inode btrees corresponding to that 102 * space extent and that there is a corresponding entry in the reverse 103 * mapping btree. Inconsistent metadata is noted by setting the 104 * XCORRUPT flag; btree query function errors are noted by setting the 105 * XFAIL flag and deleting the cursor to prevent further attempts to 106 * cross-reference with a defective btree. 107 * 108 * If a piece of metadata proves corrupt or suboptimal, the userspace 109 * program can ask the kernel to apply some tender loving care (TLC) to 110 * the metadata object by setting the REPAIR flag and re-calling the 111 * scrub ioctl. "Corruption" is defined by metadata violating the 112 * on-disk specification; operations cannot continue if the violation is 113 * left untreated. It is possible for XFS to continue if an object is 114 * "suboptimal", however performance may be degraded. 
Repairs are 115 * usually performed by rebuilding the metadata entirely out of 116 * redundant metadata. Optimizing, on the other hand, can sometimes be 117 * done without rebuilding entire structures. 118 * 119 * Generally speaking, the repair code has the following code structure: 120 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. 121 * The first check helps us figure out if we need to rebuild or simply 122 * optimize the structure so that the rebuild knows what to do. The 123 * second check evaluates the completeness of the repair; that is what 124 * is reported to userspace. 125 * 126 * A quick note on symbol prefixes: 127 * - "xfs_" are general XFS symbols. 128 * - "xchk_" are symbols related to metadata checking. 129 * - "xrep_" are symbols related to metadata repair. 130 * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS. 131 */ 132 133 /* 134 * Scrub probe -- userspace uses this to probe if we're willing to scrub 135 * or repair a given mountpoint. This will be used by xfs_scrub to 136 * probe the kernel's abilities to scrub (and repair) the metadata. We 137 * do this by validating the ioctl inputs from userspace, preparing the 138 * filesystem for a scrub (or a repair) operation, and immediately 139 * returning to userspace. Userspace can use the returned errno and 140 * structure state to decide (in broad terms) if scrub/repair are 141 * supported by the running kernel. 
 */
/*
 * Dummy scrubber for the probe ioctl: does no checking at all.  Reaching
 * this function at all proves that the ioctl plumbing and setup path work,
 * which is what userspace's probe wants to know.
 */
static int
xchk_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	/* Only fail if userspace asked us to abort (e.g. fatal signal). */
	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}

/* Scrub setup and teardown */

/*
 * Turn off any filesystem-wide scrub hooks ("fsgates") that this scrub
 * context enabled, and clear the corresponding flags.  No-op if no
 * XCHK_FSGATES_* flag is set.
 */
static inline void
xchk_fsgates_disable(
	struct xfs_scrub	*sc)
{
	if (!(sc->flags & XCHK_FSGATES_ALL))
		return;

	trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);

	if (sc->flags & XCHK_FSGATES_DRAIN)
		xfs_defer_drain_wait_disable();

	if (sc->flags & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_disable();

	if (sc->flags & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_disable();

	if (sc->flags & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_disable();

	sc->flags &= ~XCHK_FSGATES_ALL;
}

/*
 * Free the resources associated with a scrub subtype.
 *
 * Tears down a subordinate scrub context created by
 * xchk_scrub_create_subord(): restores the parent's sm_type/sm_flags
 * (keeping any output flags the sub-scrub set), hands the transaction and
 * inode lock state back to the parent, and frees the sub-context's private
 * buffers before freeing the sub-context itself.
 */
void
xchk_scrub_free_subord(
	struct xfs_scrub_subord	*sub)
{
	struct xfs_scrub	*sc = sub->parent_sc;

	/* The sub-scrub must not have swapped out any of the shared inodes. */
	ASSERT(sc->ip == sub->sc.ip);
	ASSERT(sc->orphanage == sub->sc.orphanage);
	ASSERT(sc->tempip == sub->sc.tempip);

	/* Restore the parent's type/flags; OUT flags propagate upward. */
	sc->sm->sm_type = sub->old_smtype;
	sc->sm->sm_flags = sub->old_smflags |
				(sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
	sc->tp = sub->sc.tp;

	if (sub->sc.buf) {
		if (sub->sc.buf_cleanup)
			sub->sc.buf_cleanup(sub->sc.buf);
		kvfree(sub->sc.buf);
	}
	if (sub->sc.xmbtp)
		xmbuf_free(sub->sc.xmbtp);
	if (sub->sc.xfile)
		xfile_destroy(sub->sc.xfile);

	/* The sub-scrub may have changed the lock state; inherit it. */
	sc->ilock_flags = sub->sc.ilock_flags;
	sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
	sc->temp_ilock_flags = sub->sc.temp_ilock_flags;

	kfree(sub);
}

/*
 * Free all the resources and finish the transactions.
 *
 * @error is the error state of the scrub so far; the return value is the
 * (possibly updated) error to pass to the caller.  The release order below
 * matters: btree cursors and AG headers first, then the transaction, then
 * the realtime group, then the inode, and the fsgates hooks last.
 */
STATIC int
xchk_teardown(
	struct xfs_scrub	*sc,
	int			error)
{
	xchk_ag_free(sc, &sc->sa);
	xchk_rtgroup_btcur_free(&sc->sr);

	if (sc->tp) {
		/* Only a clean repair run gets its transaction committed. */
		if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
			error = xfs_trans_commit(sc->tp);
		else
			xfs_trans_cancel(sc->tp);
		sc->tp = NULL;
	}
	if (sc->sr.rtg)
		xchk_rtgroup_free(sc, &sc->sr);
	if (sc->ip) {
		if (sc->ilock_flags)
			xchk_iunlock(sc, sc->ilock_flags);
		xchk_irele(sc, sc->ip);
		sc->ip = NULL;
	}
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		/* Drop the write access taken for IFLAG_REPAIR runs. */
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}
	if (sc->xmbtp) {
		xmbuf_free(sc->xmbtp);
		sc->xmbtp = NULL;
	}
	if (sc->xfile) {
		xfile_destroy(sc->xfile);
		sc->xfile = NULL;
	}
	if (sc->buf) {
		if (sc->buf_cleanup)
			sc->buf_cleanup(sc->buf);
		kvfree(sc->buf);
		sc->buf_cleanup = NULL;
		sc->buf = NULL;
	}

	xrep_tempfile_rele(sc);
	xrep_orphanage_rele(sc);
	xchk_fsgates_disable(sc);
	return error;
}

/* Scrubbing dispatch.
 */

/*
 * Dispatch table of scrub operations, indexed by XFS_SCRUB_TYPE_*.
 * xfs_scrub_metadata() asserts (via BUILD_BUG_ON) that the table has
 * exactly XFS_SCRUB_TYPE_NR entries.  .has, when set, gates the type on a
 * feature predicate; .repair_eval, when set, re-evaluates a structure that
 * was already fixed instead of running the plain scrubber.
 */
static const struct xchk_meta_ops meta_scrub_ops[] = {
	[XFS_SCRUB_TYPE_PROBE] = {	/* ioctl presence test */
		.type	= ST_NONE,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_probe,
		.repair	= xrep_probe,
	},
	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_superblock,
		.repair	= xrep_superblock,
	},
	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agf,
		.repair	= xrep_agf,
	},
	[XFS_SCRUB_TYPE_AGFL] = {	/* agfl */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agfl,
		.repair	= xrep_agfl,
	},
	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agi,
		.repair	= xrep_agi,
	},
	[XFS_SCRUB_TYPE_BNOBT] = {	/* bnobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_FINOBT] = {	/* finobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.has	= xfs_has_finobt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_RMAPBT] = {	/* rmapbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_rmapbt,
		.scrub	= xchk_rmapbt,
		.has	= xfs_has_rmapbt,
		.repair	= xrep_rmapbt,
	},
	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_refcountbt,
		.scrub	= xchk_refcountbt,
		.has	= xfs_has_reflink,
		.repair	= xrep_refcountbt,
	},
	[XFS_SCRUB_TYPE_INODE] = {	/* inode record */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode,
		.scrub	= xchk_inode,
		.repair	= xrep_inode,
	},
	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_data,
		.repair	= xrep_bmap_data,
	},
	[XFS_SCRUB_TYPE_BMBTA] = {	/* inode attr fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_attr,
		.repair	= xrep_bmap_attr,
	},
	[XFS_SCRUB_TYPE_BMBTC] = {	/* inode CoW fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_cow,
		.repair	= xrep_bmap_cow,
	},
	[XFS_SCRUB_TYPE_DIR] = {	/* directory */
		.type	= ST_INODE,
		.setup	= xchk_setup_directory,
		.scrub	= xchk_directory,
		.repair	= xrep_directory,
	},
	[XFS_SCRUB_TYPE_XATTR] = {	/* extended attributes */
		.type	= ST_INODE,
		.setup	= xchk_setup_xattr,
		.scrub	= xchk_xattr,
		.repair	= xrep_xattr,
	},
	[XFS_SCRUB_TYPE_SYMLINK] = {	/* symbolic link */
		.type	= ST_INODE,
		.setup	= xchk_setup_symlink,
		.scrub	= xchk_symlink,
		.repair	= xrep_symlink,
	},
	[XFS_SCRUB_TYPE_PARENT] = {	/* parent pointers */
		.type	= ST_INODE,
		.setup	= xchk_setup_parent,
		.scrub	= xchk_parent,
		.repair	= xrep_parent,
	},
	[XFS_SCRUB_TYPE_RTBITMAP] = {	/* realtime bitmap */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtbitmap,
		.scrub	= xchk_rtbitmap,
		.repair	= xrep_rtbitmap,
	},
	[XFS_SCRUB_TYPE_RTSUM] = {	/* realtime summary */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtsummary,
		.scrub	= xchk_rtsummary,
		.repair	= xrep_rtsummary,
	},
	[XFS_SCRUB_TYPE_UQUOTA] = {	/* user quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_GQUOTA] = {	/* group quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_PQUOTA] = {	/* project quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_FSCOUNTERS] = {	/* fs summary counters */
		.type	= ST_FS,
		.setup	= xchk_setup_fscounters,
		.scrub	= xchk_fscounters,
		.repair	= xrep_fscounters,
	},
	[XFS_SCRUB_TYPE_QUOTACHECK] = {	/* quota counters */
		.type	= ST_FS,
		.setup	= xchk_setup_quotacheck,
		.scrub	= xchk_quotacheck,
		.repair	= xrep_quotacheck,
	},
	[XFS_SCRUB_TYPE_NLINKS] = {	/* inode link counts */
		.type	= ST_FS,
		.setup	= xchk_setup_nlinks,
		.scrub	= xchk_nlinks,
		.repair	= xrep_nlinks,
	},
	[XFS_SCRUB_TYPE_HEALTHY] = {	/* fs healthy; clean all reminders */
		.type	= ST_FS,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_health_record,
		.repair	= xrep_notsupported,
	},
	[XFS_SCRUB_TYPE_DIRTREE] = {	/* directory tree structure */
		.type	= ST_INODE,
		.setup	= xchk_setup_dirtree,
		.scrub	= xchk_dirtree,
		.has	= xfs_has_parent,
		.repair	= xrep_dirtree,
	},
	[XFS_SCRUB_TYPE_METAPATH] = {	/* metadata directory tree path */
		.type	= ST_GENERIC,
		.setup	= xchk_setup_metapath,
		.scrub	= xchk_metapath,
		.has	= xfs_has_metadir,
		.repair	= xrep_metapath,
	},
	[XFS_SCRUB_TYPE_RGSUPER] = {	/* realtime group superblock */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rgsuperblock,
		.scrub	= xchk_rgsuperblock,
		.has	= xfs_has_rtsb,
		.repair	= xrep_rgsuperblock,
	},
	[XFS_SCRUB_TYPE_RTRMAPBT] = {	/* realtime group rmapbt */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtrmapbt,
		.scrub	= xchk_rtrmapbt,
		.has	= xfs_has_rtrmapbt,
		.repair	= xrep_rtrmapbt,
	},
	[XFS_SCRUB_TYPE_RTREFCBT] = {	/* realtime refcountbt */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtrefcountbt,
		.scrub	= xchk_rtrefcountbt,
		.has	= xfs_has_rtreflink,
		.repair	= xrep_rtrefcountbt,
	},
};

/*
 * Validate the scrub request from userspace before dispatch: reject unknown
 * flags, nonzero reserved space, unknown or unsupported scrub types, and
 * sm_ino/sm_gen/sm_agno values that are inappropriate for the scrub type.
 * Returns 0 if the request may proceed, or a negative errno describing why
 * it cannot (-EINVAL for malformed input, -ENOENT for unknown/unsupported
 * types, -EOPNOTSUPP/-EROFS for repair restrictions).
 */
static int
xchk_validate_inputs(
	struct xfs_mount		*mp,
	struct xfs_scrub_metadata	*sm)
{
	int				error;
	const struct xchk_meta_ops	*ops;

	error = -EINVAL;
	/* Check our inputs. */
	sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
		goto out;
	/* sm_reserved[] must be zero */
	if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
		goto out;

	error = -ENOENT;
	/* Do we know about this type of metadata? */
	if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
		goto out;
	ops = &meta_scrub_ops[sm->sm_type];
	if (ops->setup == NULL || ops->scrub == NULL)
		goto out;
	/* Does this fs even support this type of metadata? */
	if (ops->has && !ops->has(mp))
		goto out;

	error = -EINVAL;
	/* restricting fields must be appropriate for type */
	switch (ops->type) {
	case ST_NONE:
	case ST_FS:
		/* Whole-fs scrubs take no inode or AG coordinates. */
		if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
			goto out;
		break;
	case ST_PERAG:
		if (sm->sm_ino || sm->sm_gen ||
		    sm->sm_agno >= mp->m_sb.sb_agcount)
			goto out;
		break;
	case ST_INODE:
		/* A generation number without an inode number is invalid. */
		if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
			goto out;
		break;
	case ST_GENERIC:
		break;
	case ST_RTGROUP:
		if (sm->sm_ino || sm->sm_gen)
			goto out;
		if (xfs_has_rtgroups(mp)) {
			/*
			 * On a rtgroups filesystem, there won't be an rtbitmap
			 * or rtsummary file for group 0 unless there's
			 * actually a realtime volume attached.  However, older
			 * xfs_scrub always calls the rtbitmap/rtsummary
			 * scrubbers with sm_agno==0 so transform the error
			 * code to ENOENT.
			 */
			if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
				if (sm->sm_agno == 0)
					error = -ENOENT;
				goto out;
			}
		} else {
			/*
			 * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
			 * accepted sm_agno==0, so we still accept that for
			 * scrubbing pre-rtgroups filesystems.
			 */
			if (sm->sm_agno != 0)
				goto out;
		}
		break;
	default:
		goto out;
	}

	/* No rebuild without repair. */
	if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
	    !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return -EINVAL;

	/*
	 * We only want to repair read-write v5+ filesystems.  Defer the check
	 * for ops->repair until after our scrub confirms that we need to
	 * perform repairs so that we avoid failing due to not supporting
	 * repairing an object that doesn't need repairs.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = -EOPNOTSUPP;
		if (!xfs_has_crc(mp))
			goto out;

		error = -EROFS;
		if (xfs_is_readonly(mp))
			goto out;
	}

	error = 0;
out:
	return error;
}

#ifdef CONFIG_XFS_ONLINE_REPAIR
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to repair something, we repaired it, rescanned
	 * it, and the rescan says it's still broken.  Scream about this in
	 * the system logs.
	 */
	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
	    (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				 XFS_SCRUB_OFLAG_XCORRUPT)))
		xrep_failure(sc->mp);
}
#else
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to scrub something, it's broken, and we have no
	 * way of fixing it.  Scream in the logs.
	 */
	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				XFS_SCRUB_OFLAG_XCORRUPT))
		xfs_alert_ratelimited(sc->mp,
				"Corruption detected during scrub.");
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */

/*
 * Create a new scrub context from an existing one, but with a different scrub
 * type.
 *
 * The sub-context shares the parent's inodes and transaction (copied via
 * memcpy) but gets its own ops, buffers, xfile, and in-memory btree buffer
 * target; callers must release it with xchk_scrub_free_subord().  Returns
 * the new context or an ERR_PTR on allocation failure.
 */
struct xfs_scrub_subord *
xchk_scrub_create_subord(
	struct xfs_scrub	*sc,
	unsigned int		subtype)
{
	struct xfs_scrub_subord	*sub;

	sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
	if (!sub)
		return ERR_PTR(-ENOMEM);

	/* Remember the parent's type/flags so free_subord can restore them. */
	sub->old_smtype = sc->sm->sm_type;
	sub->old_smflags = sc->sm->sm_flags;
	sub->parent_sc = sc;
	memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
	sub->sc.ops = &meta_scrub_ops[subtype];
	sub->sc.sm->sm_type = subtype;
	sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	/* The sub-scrub gets its own private scratch resources. */
	sub->sc.buf = NULL;
	sub->sc.buf_cleanup = NULL;
	sub->sc.xfile = NULL;
	sub->sc.xmbtp = NULL;

	return sub;
}

/*
 * Dispatch metadata scrubbing.
 *
 * Runs one scrub (and optionally repair) request to completion: validate
 * the request, set up a scrub context, call the type's setup and scrub
 * functions, attempt a repair if asked and needed, and tear everything
 * down.  -EDEADLOCK and -ECHRNG from setup/scrub trigger a full teardown
 * and retry with XCHK_TRY_HARDER or XCHK_NEED_DRAIN set, respectively.
 * -EFSCORRUPTED/-EFSBADCRC are converted to OFLAG_CORRUPT with a zero
 * return so userspace reads the result from sm_flags.
 */
STATIC int
xfs_scrub_metadata(
	struct file			*file,
	struct xfs_scrub_metadata	*sm)
{
	struct xchk_stats_run		run = { };
	struct xfs_scrub		*sc;
	struct xfs_mount		*mp = XFS_I(file_inode(file))->i_mount;
	u64				check_start;
	int				error = 0;

	/* Every scrub type must have a table entry. */
	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
		     (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));

	trace_xchk_start(XFS_I(file_inode(file)), sm, error);

	/* Forbidden if we are shut down or mounted norecovery. */
	error = -ESHUTDOWN;
	if (xfs_is_shutdown(mp))
		goto out;
	error = -ENOTRECOVERABLE;
	if (xfs_has_norecovery(mp))
		goto out;

	error = xchk_validate_inputs(mp, sm);
	if (error)
		goto out;

	xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);

	sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
	if (!sc) {
		error = -ENOMEM;
		goto out;
	}

	sc->mp = mp;
	sc->file = file;
	sc->sm = sm;
	sc->ops = &meta_scrub_ops[sm->sm_type];
	sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
	sc->relax = INIT_XCHK_RELAX;
retry_op:
	/*
	 * When repairs are allowed, prevent freezing or readonly remount while
	 * scrub is running with a real transaction.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = mnt_want_write_file(sc->file);
		if (error)
			goto out_sc;

		sc->flags |= XCHK_HAVE_FREEZE_PROT;
	}

	/* Set up for the operation. */
	error = sc->ops->setup(sc);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error)
		goto out_teardown;

	/* Scrub for errors. */
	check_start = xchk_stats_now();
	if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
		error = sc->ops->repair_eval(sc);
	else
		error = sc->ops->scrub(sc);
	run.scrub_ns += xchk_stats_elapsed_ns(check_start);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
		goto out_teardown;

	xchk_update_health(sc);

	if (xchk_could_repair(sc)) {
		/*
		 * If userspace asked for a repair but it wasn't necessary,
		 * report that back to userspace.
		 */
		if (!xrep_will_attempt(sc)) {
			sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
			goto out_nofix;
		}

		/*
		 * If it's broken, userspace wants us to fix it, and we haven't
		 * already tried to fix it, then attempt a repair.
		 */
		error = xrep_attempt(sc, &run);
		if (error == -EAGAIN) {
			/*
			 * Either the repair function succeeded or it couldn't
			 * get all the resources it needs; either way, we go
			 * back to the beginning and call the scrub function.
			 */
			error = xchk_teardown(sc, 0);
			if (error) {
				xrep_failure(mp);
				goto out_sc;
			}
			goto retry_op;
		}
	}

out_nofix:
	xchk_postmortem(sc);
out_teardown:
	error = xchk_teardown(sc, error);
out_sc:
	if (error != -ENOENT)
		xchk_stats_merge(mp, sm, &run);
	kfree(sc);
out:
	trace_xchk_done(XFS_I(file_inode(file)), sm, error);
	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
		/* Report corruption via the out flags, not the errno. */
		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
		error = 0;
	}
	return error;
need_drain:
	/* Tear down, enable the intent-drain fsgate, and retry the scrub. */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_NEED_DRAIN;
	run.retries++;
	goto retry_op;
try_harder:
	/*
	 * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
	 * everything we hold, then set up again with preparation for
	 * worst-case scenarios.
	 */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_TRY_HARDER;
	run.retries++;
	goto retry_op;
}

/* Scrub one aspect of one piece of metadata.
 */
int
xfs_ioc_scrub_metadata(
	struct file		*file,
	void __user		*arg)
{
	struct xfs_scrub_metadata	scrub;
	int				error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&scrub, arg, sizeof(scrub)))
		return -EFAULT;

	error = xfs_scrub_metadata(file, &scrub);
	if (error)
		return error;

	/* Copy the out-flags and state back to userspace. */
	if (copy_to_user(arg, &scrub, sizeof(scrub)))
		return -EFAULT;

	return 0;
}

/*
 * Decide if there have been any scrub failures up to this point.
 *
 * Walks the vectors preceding a barrier vector @stop_vec and returns
 * -ECANCELED if any earlier non-barrier vector either returned a runtime
 * error (other than the retry codes below) or set an out-flag matching the
 * barrier's failure mask; returns 0 otherwise.
 */
static inline int
xfs_scrubv_check_barrier(
	struct xfs_mount		*mp,
	const struct xfs_scrub_vec	*vectors,
	const struct xfs_scrub_vec	*stop_vec)
{
	const struct xfs_scrub_vec	*v;
	__u32				failmask;

	/* The barrier's sv_flags select which out-flags count as failure. */
	failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;

	for (v = vectors; v < stop_vec; v++) {
		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
			continue;

		/*
		 * Runtime errors count as a previous failure, except the ones
		 * used to ask userspace to retry.
		 */
		switch (v->sv_ret) {
		case -EBUSY:
		case -ENOENT:
		case -EUSERS:
		case 0:
			break;
		default:
			return -ECANCELED;
		}

		/*
		 * If any of the out-flags on the scrub vector match the mask
		 * that was set on the barrier vector, that's a previous fail.
		 */
		if (v->sv_flags & failmask)
			return -ECANCELED;
	}

	return 0;
}

/*
 * If the caller provided us with a nonzero inode number that isn't the ioctl
 * file, try to grab a reference to it to eliminate all further untrusted inode
 * lookups.  If we can't get the inode, let each scrub function try again.
 *
 * Returns a referenced inode on success or NULL on any failure (allocation,
 * iget, or a generation mismatch); failure is not fatal to the caller.
 */
STATIC struct xfs_inode *
xchk_scrubv_open_by_handle(
	struct xfs_mount		*mp,
	const struct xfs_scrub_vec_head	*head)
{
	struct xfs_trans		*tp;
	struct xfs_inode		*ip;
	int				error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return NULL;

	error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
	xfs_trans_cancel(tp);
	if (error)
		return NULL;

	/* The handle's generation must match the live inode. */
	if (VFS_I(ip)->i_generation != head->svh_gen) {
		xfs_irele(ip);
		return NULL;
	}

	return ip;
}

/*
 * Vectored scrub implementation to reduce ioctl calls.
 *
 * Copies in a header and an array of scrub vectors, validates them all up
 * front, then runs each vector as a separate xfs_scrub_metadata() call
 * against the same sm_ino/sm_gen/sm_agno coordinates from the header.
 * BARRIER vectors stop the run early if any prior vector failed.  Per-item
 * results go back to userspace in the vector array even when the loop
 * stops early.
 */
int
xfs_ioc_scrubv_metadata(
	struct file			*file,
	void __user			*arg)
{
	struct xfs_scrub_vec_head	head;
	struct xfs_scrub_vec_head __user *uhead = arg;
	struct xfs_scrub_vec		*vectors;
	struct xfs_scrub_vec __user	*uvectors;
	struct xfs_inode		*ip_in = XFS_I(file_inode(file));
	struct xfs_mount		*mp = ip_in->i_mount;
	struct xfs_inode		*handle_ip = NULL;
	struct xfs_scrub_vec		*v;
	size_t				vec_bytes;
	unsigned int			i;
	int				error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&head, uhead, sizeof(head)))
		return -EFAULT;

	if (head.svh_reserved)
		return -EINVAL;
	if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
		return -EINVAL;
	if (head.svh_nr == 0)
		return 0;

	/* Cap the vector array at one page; array_size checks for overflow. */
	vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
	if (vec_bytes > PAGE_SIZE)
		return -ENOMEM;

	uvectors = u64_to_user_ptr(head.svh_vectors);
	vectors = memdup_user(uvectors, vec_bytes);
	if (IS_ERR(vectors))
		return PTR_ERR(vectors);

	trace_xchk_scrubv_start(ip_in, &head);

	/* Validate every vector before running any of them. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		if (v->sv_reserved) {
			error = -EINVAL;
			goto out_free;
		}

		/* Barriers may only carry an out-flag failure mask. */
		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
		    (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
			error = -EINVAL;
			goto out_free;
		}

		trace_xchk_scrubv_item(mp, &head, i, v);
	}

	/*
	 * If the caller wants us to do a scrub-by-handle and the file used to
	 * call the ioctl is not the same file, load the incore inode and pin
	 * it across all the scrubv actions to avoid repeated UNTRUSTED
	 * lookups.  The reference is not passed to deeper layers of scrub
	 * because each scrubber gets to decide its own strategy and return
	 * values for getting an inode.
	 */
	if (head.svh_ino && head.svh_ino != ip_in->i_ino)
		handle_ip = xchk_scrubv_open_by_handle(mp, &head);

	/* Run all the scrubbers. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		struct xfs_scrub_metadata	sm = {
			.sm_type	= v->sv_type,
			.sm_flags	= v->sv_flags,
			.sm_ino		= head.svh_ino,
			.sm_gen		= head.svh_gen,
			.sm_agno	= head.svh_agno,
		};

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
			v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
			if (v->sv_ret) {
				trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
				break;
			}

			continue;
		}

		v->sv_ret = xfs_scrub_metadata(file, &sm);
		v->sv_flags = sm.sm_flags;

		trace_xchk_scrubv_outcome(mp, &head, i, v);

		/* Optionally rest between items to limit system load. */
		if (head.svh_rest_us) {
			ktime_t		expires;

			expires = ktime_add_ns(ktime_get(),
					head.svh_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out_free;
		}
	}

	if (copy_to_user(uvectors, vectors, vec_bytes) ||
	    copy_to_user(uhead, &head, sizeof(head))) {
		error = -EFAULT;
		goto out_free;
	}

out_free:
	if (handle_ip)
		xfs_irele(handle_ip);
	kfree(vectors);
	return error;
}