// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "xfs_rmap.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_dir2.h"
#include "xfs_parent.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
#include "scrub/tempfile.h"
#include "scrub/orphanage.h"

/*
 * Online Scrub and Repair
 *
 * Traditionally, XFS (the kernel driver) did not know how to check or
 * repair on-disk data structures.  That task was left to the xfs_check
 * and xfs_repair tools, both of which require taking the filesystem
 * offline for a thorough but time-consuming examination.  Online
 * scrub & repair, on the other hand, enables us to check the metadata
 * for obvious errors while carefully stepping around the filesystem's
 * ongoing operations, locking rules, etc.
 *
 * Given that most XFS metadata consist of records stored in a btree,
 * most of the checking functions iterate the btree blocks themselves
 * looking for irregularities.  When a record block is encountered, each
 * record can be checked for obviously bad values.  Record values can
 * also be cross-referenced against other btrees to look for potential
 * misunderstandings between pieces of metadata.
 *
 * It is expected that the checkers responsible for per-AG metadata
 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
 * metadata structure, and perform any relevant cross-referencing before
 * unlocking the AG and returning the results to userspace.  These
 * scrubbers must not keep an AG locked for too long to avoid tying up
 * the block and inode allocators.
 *
 * Block maps and b-trees rooted in an inode present a special challenge
 * because they can involve extents from any AG.  The general scrubber
 * structure of lock -> check -> xref -> unlock still holds, but AG
 * locking order rules /must/ be obeyed to avoid deadlocks.  The
 * ordering rule, of course, is that we must lock in increasing AG
 * order.  Helper functions are provided to track which AG headers we've
 * already locked.  If we detect an imminent locking order violation, we
 * can signal a potential deadlock, in which case the scrubber can jump
 * out to the top level, lock all the AGs in order, and retry the scrub.
 *
 * For file data (directories, extended attributes, symlinks) scrub, we
 * can simply lock the inode and walk the data.  For btree data
 * (directories and attributes) we follow the same btree-scrubbing
 * strategy outlined previously to check the records.
 *
 * We use a bit of trickery with transactions to avoid buffer deadlocks
 * if there is a cycle in the metadata.  The basic problem is that
 * travelling down a btree involves locking the current buffer at each
 * tree level.  If a pointer should somehow point back to a buffer that
 * we've already examined, we will deadlock due to the second buffer
 * locking attempt.  Note however that grabbing a buffer in transaction
 * context links the locked buffer to the transaction.  If we try to
 * re-grab the buffer in the context of the same transaction, we avoid
 * the second lock attempt and continue.  Between the verifier and the
 * scrubber, something will notice that something is amiss and report
 * the corruption.  Therefore, each scrubber will allocate an empty
 * transaction, attach buffers to it, and cancel the transaction at the
 * end of the scrub run.  Cancelling a non-dirty transaction simply
 * unlocks the buffers.
 *
 * There are four pieces of data that scrub can communicate to
 * userspace.  The first is the error code (errno), which can be used to
 * communicate operational errors in performing the scrub.  There are
 * also three flags that can be set in the scrub context.  If the data
 * structure itself is corrupt, the CORRUPT flag will be set.  If
 * the metadata is correct but otherwise suboptimal, the PREEN flag
 * will be set.
 *
 * We perform secondary validation of filesystem metadata by
 * cross-referencing every record with all other available metadata.
 * For example, for block mapping extents, we verify that there are no
 * records in the free space and inode btrees corresponding to that
 * space extent and that there is a corresponding entry in the reverse
 * mapping btree.  Inconsistent metadata is noted by setting the
 * XCORRUPT flag; btree query function errors are noted by setting the
 * XFAIL flag and deleting the cursor to prevent further attempts to
 * cross-reference with a defective btree.
 *
 * If a piece of metadata proves corrupt or suboptimal, the userspace
 * program can ask the kernel to apply some tender loving care (TLC) to
 * the metadata object by setting the REPAIR flag and re-calling the
 * scrub ioctl.  "Corruption" is defined by metadata violating the
 * on-disk specification; operations cannot continue if the violation is
 * left untreated.  It is possible for XFS to continue if an object is
 * "suboptimal", however performance may be degraded.  Repairs are
 * usually performed by rebuilding the metadata entirely out of
 * redundant metadata.  Optimizing, on the other hand, can sometimes be
 * done without rebuilding entire structures.
 *
 * Generally speaking, the repair code has the following code structure:
 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
 * The first check helps us figure out if we need to rebuild or simply
 * optimize the structure so that the rebuild knows what to do.  The
 * second check evaluates the completeness of the repair; that is what
 * is reported to userspace.
 *
 * A quick note on symbol prefixes:
 * - "xfs_" are general XFS symbols.
 * - "xchk_" are symbols related to metadata checking.
 * - "xrep_" are symbols related to metadata repair.
 * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
 */
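
/*
 * A minimal, illustration-only sketch of the empty-transaction trick
 * described above; this helper is not part of the driver, and daddr,
 * numblks, and ops are placeholders for whatever the caller walks.
 */
static inline int
xchk_example_empty_trans(
	struct xfs_mount	*mp,
	xfs_daddr_t		daddr,
	int			numblks,
	const struct xfs_buf_ops *ops)
{
	struct xfs_trans	*tp;
	struct xfs_buf		*bp;
	int			error;

	/* Empty transactions reserve no log space and no blocks. */
	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/* The first read locks the buffer and attaches it to @tp. */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, daddr,
			numblks, 0, &bp, ops);

	/*
	 * Re-reading the same buffer in the context of @tp would find
	 * the attached buffer instead of trying (and deadlocking on) a
	 * second lock.  Cancelling the still-clean transaction simply
	 * unlocks every buffer attached to it.
	 */
	xfs_trans_cancel(tp);
	return error;
}
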
/*
 * Scrub probe -- userspace uses this to probe if we're willing to scrub
 * or repair a given mountpoint.  This will be used by xfs_scrub to
 * probe the kernel's abilities to scrub (and repair) the metadata.  We
 * do this by validating the ioctl inputs from userspace, preparing the
 * filesystem for a scrub (or a repair) operation, and immediately
 * returning to userspace.  Userspace can use the returned errno and
 * structure state to decide (in broad terms) if scrub/repair are
 * supported by the running kernel.
 */
static int
xchk_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}
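
/*
 * For illustration, userspace performs the probe by issuing an
 * otherwise-zeroed request whose sm_type is XFS_SCRUB_TYPE_PROBE:
 *
 *	struct xfs_scrub_metadata	sm = {
 *		.sm_type = XFS_SCRUB_TYPE_PROBE,
 *	};
 *
 *	error = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
 *
 * A zero return means the running kernel is willing to scrub; setting
 * XFS_SCRUB_IFLAG_REPAIR in sm_flags probes for repair support too.
 */
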
/* Scrub setup and teardown */

static inline void
xchk_fsgates_disable(
	struct xfs_scrub	*sc)
{
	if (!(sc->flags & XCHK_FSGATES_ALL))
		return;

	trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);

	if (sc->flags & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_disable();

	if (sc->flags & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_disable();

	if (sc->flags & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_disable();

	if (sc->flags & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_disable();

	sc->flags &= ~XCHK_FSGATES_ALL;
}
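
/*
 * The enable half of the gates above lives with the individual
 * scrubbers: setup code that needs, say, the intent drains does
 * roughly this before walking metadata:
 *
 *	xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 *
 * which turns on the hook and records the matching XCHK_FSGATES_* bit
 * in sc->flags so that xchk_fsgates_disable() above can undo it at
 * teardown time.
 */
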
/* Free the resources associated with a scrub subtype. */
void
xchk_scrub_free_subord(
	struct xfs_scrub_subord	*sub)
{
	struct xfs_scrub	*sc = sub->parent_sc;

	ASSERT(sc->ip == sub->sc.ip);
	ASSERT(sc->orphanage == sub->sc.orphanage);
	ASSERT(sc->tempip == sub->sc.tempip);

	sc->sm->sm_type = sub->old_smtype;
	sc->sm->sm_flags = sub->old_smflags |
				(sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
	sc->tp = sub->sc.tp;

	if (sub->sc.buf) {
		if (sub->sc.buf_cleanup)
			sub->sc.buf_cleanup(sub->sc.buf);
		kvfree(sub->sc.buf);
	}
	if (sub->sc.xmbtp)
		xmbuf_free(sub->sc.xmbtp);
	if (sub->sc.xfile)
		xfile_destroy(sub->sc.xfile);

	sc->ilock_flags = sub->sc.ilock_flags;
	sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
	sc->temp_ilock_flags = sub->sc.temp_ilock_flags;

	kfree(sub);
}

/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
	struct xfs_scrub	*sc,
	int			error)
{
	xchk_ag_free(sc, &sc->sa);
	if (sc->tp) {
		if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
			error = xfs_trans_commit(sc->tp);
		else
			xfs_trans_cancel(sc->tp);
		sc->tp = NULL;
	}
	if (sc->sr.rtg)
		xchk_rtgroup_free(sc, &sc->sr);
	if (sc->ip) {
		if (sc->ilock_flags)
			xchk_iunlock(sc, sc->ilock_flags);
		xchk_irele(sc, sc->ip);
		sc->ip = NULL;
	}
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}
	if (sc->xmbtp) {
		xmbuf_free(sc->xmbtp);
		sc->xmbtp = NULL;
	}
	if (sc->xfile) {
		xfile_destroy(sc->xfile);
		sc->xfile = NULL;
	}
	if (sc->buf) {
		if (sc->buf_cleanup)
			sc->buf_cleanup(sc->buf);
		kvfree(sc->buf);
		sc->buf_cleanup = NULL;
		sc->buf = NULL;
	}

	xrep_tempfile_rele(sc);
	xrep_orphanage_rele(sc);
	xchk_fsgates_disable(sc);
	return error;
}

/* Scrubbing dispatch. */

static const struct xchk_meta_ops meta_scrub_ops[] = {
	[XFS_SCRUB_TYPE_PROBE] = {	/* ioctl presence test */
		.type	= ST_NONE,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_probe,
		.repair	= xrep_probe,
	},
	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_superblock,
		.repair	= xrep_superblock,
	},
	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agf,
		.repair	= xrep_agf,
	},
	[XFS_SCRUB_TYPE_AGFL] = {	/* agfl */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agfl,
		.repair	= xrep_agfl,
	},
	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agi,
		.repair	= xrep_agi,
	},
	[XFS_SCRUB_TYPE_BNOBT] = {	/* bnobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_FINOBT] = {	/* finobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.has	= xfs_has_finobt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_RMAPBT] = {	/* rmapbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_rmapbt,
		.scrub	= xchk_rmapbt,
		.has	= xfs_has_rmapbt,
		.repair	= xrep_rmapbt,
	},
	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_refcountbt,
		.scrub	= xchk_refcountbt,
		.has	= xfs_has_reflink,
		.repair	= xrep_refcountbt,
	},
	[XFS_SCRUB_TYPE_INODE] = {	/* inode record */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode,
		.scrub	= xchk_inode,
		.repair	= xrep_inode,
	},
	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_data,
		.repair	= xrep_bmap_data,
	},
	[XFS_SCRUB_TYPE_BMBTA] = {	/* inode attr fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_attr,
		.repair	= xrep_bmap_attr,
	},
	[XFS_SCRUB_TYPE_BMBTC] = {	/* inode CoW fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_cow,
		.repair	= xrep_bmap_cow,
	},
	[XFS_SCRUB_TYPE_DIR] = {	/* directory */
		.type	= ST_INODE,
		.setup	= xchk_setup_directory,
		.scrub	= xchk_directory,
		.repair	= xrep_directory,
	},
	[XFS_SCRUB_TYPE_XATTR] = {	/* extended attributes */
		.type	= ST_INODE,
		.setup	= xchk_setup_xattr,
		.scrub	= xchk_xattr,
		.repair	= xrep_xattr,
	},
	[XFS_SCRUB_TYPE_SYMLINK] = {	/* symbolic link */
		.type	= ST_INODE,
		.setup	= xchk_setup_symlink,
		.scrub	= xchk_symlink,
		.repair	= xrep_symlink,
	},
	[XFS_SCRUB_TYPE_PARENT] = {	/* parent pointers */
		.type	= ST_INODE,
		.setup	= xchk_setup_parent,
		.scrub	= xchk_parent,
		.repair	= xrep_parent,
	},
	[XFS_SCRUB_TYPE_RTBITMAP] = {	/* realtime bitmap */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtbitmap,
		.scrub	= xchk_rtbitmap,
		.repair	= xrep_rtbitmap,
	},
	[XFS_SCRUB_TYPE_RTSUM] = {	/* realtime summary */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtsummary,
		.scrub	= xchk_rtsummary,
		.repair	= xrep_rtsummary,
	},
	[XFS_SCRUB_TYPE_UQUOTA] = {	/* user quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_GQUOTA] = {	/* group quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_PQUOTA] = {	/* project quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_FSCOUNTERS] = {	/* fs summary counters */
		.type	= ST_FS,
		.setup	= xchk_setup_fscounters,
		.scrub	= xchk_fscounters,
		.repair	= xrep_fscounters,
	},
	[XFS_SCRUB_TYPE_QUOTACHECK] = {	/* quota counters */
		.type	= ST_FS,
		.setup	= xchk_setup_quotacheck,
		.scrub	= xchk_quotacheck,
		.repair	= xrep_quotacheck,
	},
	[XFS_SCRUB_TYPE_NLINKS] = {	/* inode link counts */
		.type	= ST_FS,
		.setup	= xchk_setup_nlinks,
		.scrub	= xchk_nlinks,
		.repair	= xrep_nlinks,
	},
	[XFS_SCRUB_TYPE_HEALTHY] = {	/* fs healthy; clean all reminders */
		.type	= ST_FS,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_health_record,
		.repair	= xrep_notsupported,
	},
	[XFS_SCRUB_TYPE_DIRTREE] = {	/* directory tree structure */
		.type	= ST_INODE,
		.setup	= xchk_setup_dirtree,
		.scrub	= xchk_dirtree,
		.has	= xfs_has_parent,
		.repair	= xrep_dirtree,
	},
	[XFS_SCRUB_TYPE_METAPATH] = {	/* metadata directory tree path */
		.type	= ST_GENERIC,
		.setup	= xchk_setup_metapath,
		.scrub	= xchk_metapath,
		.has	= xfs_has_metadir,
		.repair	= xrep_metapath,
	},
	[XFS_SCRUB_TYPE_RGSUPER] = {	/* realtime group superblock */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rgsuperblock,
		.scrub	= xchk_rgsuperblock,
		.has	= xfs_has_rtsb,
		.repair	= xrep_rgsuperblock,
	},
};
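
/*
 * Illustrative sketch (not part of the driver) of how the dispatch
 * table above is consumed: sm_type indexes meta_scrub_ops and the
 * optional ->has() predicate screens out metadata types that this
 * filesystem does not have.  The real checks live in
 * xchk_validate_inputs() below.
 */
static inline bool
xchk_example_type_supported(
	struct xfs_mount		*mp,
	unsigned int			type)
{
	const struct xchk_meta_ops	*ops;

	if (type >= XFS_SCRUB_TYPE_NR)
		return false;

	ops = &meta_scrub_ops[type];
	if (ops->setup == NULL || ops->scrub == NULL)
		return false;

	return ops->has == NULL || ops->has(mp);
}
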
static int
xchk_validate_inputs(
	struct xfs_mount		*mp,
	struct xfs_scrub_metadata	*sm)
{
	int				error;
	const struct xchk_meta_ops	*ops;

	error = -EINVAL;
	/* Check our inputs. */
	sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
		goto out;
	/* sm_reserved[] must be zero */
	if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
		goto out;

	error = -ENOENT;
	/* Do we know about this type of metadata? */
	if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
		goto out;
	ops = &meta_scrub_ops[sm->sm_type];
	if (ops->setup == NULL || ops->scrub == NULL)
		goto out;
	/* Does this fs even support this type of metadata? */
	if (ops->has && !ops->has(mp))
		goto out;

	error = -EINVAL;
	/* restricting fields must be appropriate for type */
	switch (ops->type) {
	case ST_NONE:
	case ST_FS:
		if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
			goto out;
		break;
	case ST_PERAG:
		if (sm->sm_ino || sm->sm_gen ||
		    sm->sm_agno >= mp->m_sb.sb_agcount)
			goto out;
		break;
	case ST_INODE:
		if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
			goto out;
		break;
	case ST_GENERIC:
		break;
	case ST_RTGROUP:
		if (sm->sm_ino || sm->sm_gen)
			goto out;
		if (xfs_has_rtgroups(mp)) {
			/*
			 * On a rtgroups filesystem, there won't be an rtbitmap
			 * or rtsummary file for group 0 unless there's
			 * actually a realtime volume attached.  However, older
			 * xfs_scrub always calls the rtbitmap/rtsummary
			 * scrubbers with sm_agno==0 so transform the error
			 * code to ENOENT.
			 */
			if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
				if (sm->sm_agno == 0)
					error = -ENOENT;
				goto out;
			}
		} else {
			/*
			 * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
			 * accepted sm_agno==0, so we still accept that for
			 * scrubbing pre-rtgroups filesystems.
			 */
			if (sm->sm_agno != 0)
				goto out;
		}
		break;
	default:
		goto out;
	}

	/* No rebuild without repair. */
	if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
	    !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return -EINVAL;

	/*
	 * We only want to repair read-write v5+ filesystems.  Defer the check
	 * for ops->repair until after our scrub confirms that we need to
	 * perform repairs so that we avoid failing due to not supporting
	 * repairing an object that doesn't need repairs.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = -EOPNOTSUPP;
		if (!xfs_has_crc(mp))
			goto out;

		error = -EROFS;
		if (xfs_is_readonly(mp))
			goto out;
	}

	error = 0;
out:
	return error;
}

#ifdef CONFIG_XFS_ONLINE_REPAIR
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to repair something, we repaired it, rescanned
	 * it, and the rescan says it's still broken.  Scream about this in
	 * the system logs.
	 */
	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
	    (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				 XFS_SCRUB_OFLAG_XCORRUPT)))
		xrep_failure(sc->mp);
}
#else
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to scrub something, it's broken, and we have no
	 * way of fixing it.  Scream in the logs.
	 */
	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				XFS_SCRUB_OFLAG_XCORRUPT))
		xfs_alert_ratelimited(sc->mp,
				"Corruption detected during scrub.");
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */

/*
 * Create a new scrub context from an existing one, but with a different scrub
 * type.
 */
struct xfs_scrub_subord *
xchk_scrub_create_subord(
	struct xfs_scrub	*sc,
	unsigned int		subtype)
{
	struct xfs_scrub_subord	*sub;

	sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
	if (!sub)
		return ERR_PTR(-ENOMEM);

	sub->old_smtype = sc->sm->sm_type;
	sub->old_smflags = sc->sm->sm_flags;
	sub->parent_sc = sc;
	memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
	sub->sc.ops = &meta_scrub_ops[subtype];
	sub->sc.sm->sm_type = subtype;
	sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	sub->sc.buf = NULL;
	sub->sc.buf_cleanup = NULL;
	sub->sc.xfile = NULL;
	sub->sc.xmbtp = NULL;

	return sub;
}

/* Dispatch metadata scrubbing. */
STATIC int
xfs_scrub_metadata(
	struct file			*file,
	struct xfs_scrub_metadata	*sm)
{
	struct xchk_stats_run		run = { };
	struct xfs_scrub		*sc;
	struct xfs_mount		*mp = XFS_I(file_inode(file))->i_mount;
	u64				check_start;
	int				error = 0;

	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
		     (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));

	trace_xchk_start(XFS_I(file_inode(file)), sm, error);

	/* Forbidden if we are shut down or mounted norecovery. */
	error = -ESHUTDOWN;
	if (xfs_is_shutdown(mp))
		goto out;
	error = -ENOTRECOVERABLE;
	if (xfs_has_norecovery(mp))
		goto out;

	error = xchk_validate_inputs(mp, sm);
	if (error)
		goto out;

	xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);

	sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
	if (!sc) {
		error = -ENOMEM;
		goto out;
	}

	sc->mp = mp;
	sc->file = file;
	sc->sm = sm;
	sc->ops = &meta_scrub_ops[sm->sm_type];
	sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
	sc->relax = INIT_XCHK_RELAX;
retry_op:
	/*
	 * When repairs are allowed, prevent freezing or readonly remount while
	 * scrub is running with a real transaction.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = mnt_want_write_file(sc->file);
		if (error)
			goto out_sc;

		sc->flags |= XCHK_HAVE_FREEZE_PROT;
	}

	/* Set up for the operation. */
	error = sc->ops->setup(sc);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error)
		goto out_teardown;

	/* Scrub for errors. */
	check_start = xchk_stats_now();
	if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
		error = sc->ops->repair_eval(sc);
	else
		error = sc->ops->scrub(sc);
	run.scrub_ns += xchk_stats_elapsed_ns(check_start);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
		goto out_teardown;

	xchk_update_health(sc);

	if (xchk_could_repair(sc)) {
		/*
		 * If userspace asked for a repair but it wasn't necessary,
		 * report that back to userspace.
		 */
		if (!xrep_will_attempt(sc)) {
			sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
			goto out_nofix;
		}

		/*
		 * If it's broken, userspace wants us to fix it, and we haven't
		 * already tried to fix it, then attempt a repair.
		 */
		error = xrep_attempt(sc, &run);
		if (error == -EAGAIN) {
			/*
			 * Either the repair function succeeded or it couldn't
			 * get all the resources it needs; either way, we go
			 * back to the beginning and call the scrub function.
			 */
			error = xchk_teardown(sc, 0);
			if (error) {
				xrep_failure(mp);
				goto out_sc;
			}
			goto retry_op;
		}
	}

out_nofix:
	xchk_postmortem(sc);
out_teardown:
	error = xchk_teardown(sc, error);
out_sc:
	if (error != -ENOENT)
		xchk_stats_merge(mp, sm, &run);
	kfree(sc);
out:
	trace_xchk_done(XFS_I(file_inode(file)), sm, error);
	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
		error = 0;
	}
	return error;
need_drain:
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_NEED_DRAIN;
	run.retries++;
	goto retry_op;
try_harder:
	/*
	 * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
	 * everything we hold, then set up again with preparation for
	 * worst-case scenarios.
	 */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_TRY_HARDER;
	run.retries++;
	goto retry_op;
}

/* Scrub one aspect of one piece of metadata. */
int
xfs_ioc_scrub_metadata(
	struct file			*file,
	void __user			*arg)
{
	struct xfs_scrub_metadata	scrub;
	int				error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&scrub, arg, sizeof(scrub)))
		return -EFAULT;

	error = xfs_scrub_metadata(file, &scrub);
	if (error)
		return error;

	if (copy_to_user(arg, &scrub, sizeof(scrub)))
		return -EFAULT;

	return 0;
}

/* Decide if there have been any scrub failures up to this point. */
static inline int
xfs_scrubv_check_barrier(
	struct xfs_mount		*mp,
	const struct xfs_scrub_vec	*vectors,
	const struct xfs_scrub_vec	*stop_vec)
{
	const struct xfs_scrub_vec	*v;
	__u32				failmask;

	failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;

	for (v = vectors; v < stop_vec; v++) {
		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
			continue;

		/*
		 * Runtime errors count as a previous failure, except the ones
		 * used to ask userspace to retry.
		 */
		switch (v->sv_ret) {
		case -EBUSY:
		case -ENOENT:
		case -EUSERS:
		case 0:
			break;
		default:
			return -ECANCELED;
		}

		/*
		 * If any of the out-flags on the scrub vector match the mask
		 * that was set on the barrier vector, that's a previous fail.
		 */
		if (v->sv_flags & failmask)
			return -ECANCELED;
	}

	return 0;
}

/*
 * If the caller provided us with a nonzero inode number that isn't the ioctl
 * file, try to grab a reference to it to eliminate all further untrusted inode
 * lookups.  If we can't get the inode, let each scrub function try again.
 */
STATIC struct xfs_inode *
xchk_scrubv_open_by_handle(
	struct xfs_mount		*mp,
	const struct xfs_scrub_vec_head	*head)
{
	struct xfs_trans		*tp;
	struct xfs_inode		*ip;
	int				error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return NULL;

	error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
	xfs_trans_cancel(tp);
	if (error)
		return NULL;

	if (VFS_I(ip)->i_generation != head->svh_gen) {
		xfs_irele(ip);
		return NULL;
	}

	return ip;
}

/* Vectored scrub implementation to reduce ioctl calls. */
int
xfs_ioc_scrubv_metadata(
	struct file			*file,
	void __user			*arg)
{
	struct xfs_scrub_vec_head	head;
	struct xfs_scrub_vec_head __user *uhead = arg;
	struct xfs_scrub_vec		*vectors;
	struct xfs_scrub_vec __user	*uvectors;
	struct xfs_inode		*ip_in = XFS_I(file_inode(file));
	struct xfs_mount		*mp = ip_in->i_mount;
	struct xfs_inode		*handle_ip = NULL;
	struct xfs_scrub_vec		*v;
	size_t				vec_bytes;
	unsigned int			i;
	int				error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&head, uhead, sizeof(head)))
		return -EFAULT;

	if (head.svh_reserved)
		return -EINVAL;
	if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
		return -EINVAL;
	if (head.svh_nr == 0)
		return 0;

	vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
	if (vec_bytes > PAGE_SIZE)
		return -ENOMEM;

	uvectors = u64_to_user_ptr(head.svh_vectors);
	vectors = memdup_user(uvectors, vec_bytes);
	if (IS_ERR(vectors))
		return PTR_ERR(vectors);

	trace_xchk_scrubv_start(ip_in, &head);

	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		if (v->sv_reserved) {
			error = -EINVAL;
			goto out_free;
		}

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
		    (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
			error = -EINVAL;
			goto out_free;
		}

		trace_xchk_scrubv_item(mp, &head, i, v);
	}

	/*
	 * If the caller wants us to do a scrub-by-handle and the file used to
	 * call the ioctl is not the same file, load the incore inode and pin
	 * it across all the scrubv actions to avoid repeated UNTRUSTED
	 * lookups.  The reference is not passed to deeper layers of scrub
	 * because each scrubber gets to decide its own strategy and return
	 * values for getting an inode.
	 */
	if (head.svh_ino && head.svh_ino != ip_in->i_ino)
		handle_ip = xchk_scrubv_open_by_handle(mp, &head);

	/* Run all the scrubbers. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		struct xfs_scrub_metadata	sm = {
			.sm_type	= v->sv_type,
			.sm_flags	= v->sv_flags,
			.sm_ino		= head.svh_ino,
			.sm_gen		= head.svh_gen,
			.sm_agno	= head.svh_agno,
		};

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
			v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
			if (v->sv_ret) {
				trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
				break;
			}

			continue;
		}

		v->sv_ret = xfs_scrub_metadata(file, &sm);
		v->sv_flags = sm.sm_flags;

		trace_xchk_scrubv_outcome(mp, &head, i, v);

		if (head.svh_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					head.svh_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out_free;
		}
	}

	if (copy_to_user(uvectors, vectors, vec_bytes) ||
	    copy_to_user(uhead, &head, sizeof(head))) {
		error = -EFAULT;
		goto out_free;
	}

out_free:
	if (handle_ip)
		xfs_irele(handle_ip);
	kfree(vectors);
	return error;
}
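
/*
 * For illustration, a caller-side view of the vectored interface
 * (userspace pseudocode, error handling elided).  The vectors live in
 * a separate array referenced by svh_vectors:
 *
 *	struct xfs_scrub_vec		vec[2] = {
 *		{ .sv_type = XFS_SCRUB_TYPE_AGF },
 *		{ .sv_type = XFS_SCRUB_TYPE_BNOBT },
 *	};
 *	struct xfs_scrub_vec_head	head = {
 *		.svh_agno = 0,
 *		.svh_nr = 2,
 *		.svh_vectors = (__u64)(uintptr_t)vec,
 *	};
 *
 *	error = ioctl(fd, XFS_IOC_SCRUBV_METADATA, &head);
 *
 * On return, each vec[i].sv_ret and sv_flags carries that scrubber's
 * outcome, subject to any XFS_SCRUB_TYPE_BARRIER entries in the array.
 */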