1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2017-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans.h" 14 #include "xfs_inode.h" 15 #include "xfs_quota.h" 16 #include "xfs_qm.h" 17 #include "xfs_scrub.h" 18 #include "xfs_buf_mem.h" 19 #include "xfs_rmap.h" 20 #include "xfs_exchrange.h" 21 #include "xfs_exchmaps.h" 22 #include "xfs_dir2.h" 23 #include "xfs_parent.h" 24 #include "xfs_icache.h" 25 #include "scrub/scrub.h" 26 #include "scrub/common.h" 27 #include "scrub/trace.h" 28 #include "scrub/repair.h" 29 #include "scrub/health.h" 30 #include "scrub/stats.h" 31 #include "scrub/xfile.h" 32 #include "scrub/tempfile.h" 33 #include "scrub/orphanage.h" 34 35 /* 36 * Online Scrub and Repair 37 * 38 * Traditionally, XFS (the kernel driver) did not know how to check or 39 * repair on-disk data structures. That task was left to the xfs_check 40 * and xfs_repair tools, both of which require taking the filesystem 41 * offline for a thorough but time consuming examination. Online 42 * scrub & repair, on the other hand, enables us to check the metadata 43 * for obvious errors while carefully stepping around the filesystem's 44 * ongoing operations, locking rules, etc. 45 * 46 * Given that most XFS metadata consist of records stored in a btree, 47 * most of the checking functions iterate the btree blocks themselves 48 * looking for irregularities. When a record block is encountered, each 49 * record can be checked for obviously bad values. Record values can 50 * also be cross-referenced against other btrees to look for potential 51 * misunderstandings between pieces of metadata. 
52 * 53 * It is expected that the checkers responsible for per-AG metadata 54 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the 55 * metadata structure, and perform any relevant cross-referencing before 56 * unlocking the AG and returning the results to userspace. These 57 * scrubbers must not keep an AG locked for too long to avoid tying up 58 * the block and inode allocators. 59 * 60 * Block maps and b-trees rooted in an inode present a special challenge 61 * because they can involve extents from any AG. The general scrubber 62 * structure of lock -> check -> xref -> unlock still holds, but AG 63 * locking order rules /must/ be obeyed to avoid deadlocks. The 64 * ordering rule, of course, is that we must lock in increasing AG 65 * order. Helper functions are provided to track which AG headers we've 66 * already locked. If we detect an imminent locking order violation, we 67 * can signal a potential deadlock, in which case the scrubber can jump 68 * out to the top level, lock all the AGs in order, and retry the scrub. 69 * 70 * For file data (directories, extended attributes, symlinks) scrub, we 71 * can simply lock the inode and walk the data. For btree data 72 * (directories and attributes) we follow the same btree-scrubbing 73 * strategy outlined previously to check the records. 74 * 75 * We use a bit of trickery with transactions to avoid buffer deadlocks 76 * if there is a cycle in the metadata. The basic problem is that 77 * travelling down a btree involves locking the current buffer at each 78 * tree level. If a pointer should somehow point back to a buffer that 79 * we've already examined, we will deadlock due to the second buffer 80 * locking attempt. Note however that grabbing a buffer in transaction 81 * context links the locked buffer to the transaction. If we try to 82 * re-grab the buffer in the context of the same transaction, we avoid 83 * the second lock attempt and continue. 
Between the verifier and the 84 * scrubber, something will notice that something is amiss and report 85 * the corruption. Therefore, each scrubber will allocate an empty 86 * transaction, attach buffers to it, and cancel the transaction at the 87 * end of the scrub run. Cancelling a non-dirty transaction simply 88 * unlocks the buffers. 89 * 90 * There are four pieces of data that scrub can communicate to 91 * userspace. The first is the error code (errno), which can be used to 92 * communicate operational errors in performing the scrub. There are 93 * also three flags that can be set in the scrub context. If the data 94 * structure itself is corrupt, the CORRUPT flag will be set. If 95 * the metadata is correct but otherwise suboptimal, the PREEN flag 96 * will be set. 97 * 98 * We perform secondary validation of filesystem metadata by 99 * cross-referencing every record with all other available metadata. 100 * For example, for block mapping extents, we verify that there are no 101 * records in the free space and inode btrees corresponding to that 102 * space extent and that there is a corresponding entry in the reverse 103 * mapping btree. Inconsistent metadata is noted by setting the 104 * XCORRUPT flag; btree query function errors are noted by setting the 105 * XFAIL flag and deleting the cursor to prevent further attempts to 106 * cross-reference with a defective btree. 107 * 108 * If a piece of metadata proves corrupt or suboptimal, the userspace 109 * program can ask the kernel to apply some tender loving care (TLC) to 110 * the metadata object by setting the REPAIR flag and re-calling the 111 * scrub ioctl. "Corruption" is defined by metadata violating the 112 * on-disk specification; operations cannot continue if the violation is 113 * left untreated. It is possible for XFS to continue if an object is 114 * "suboptimal", however performance may be degraded. 
Repairs are 115 * usually performed by rebuilding the metadata entirely out of 116 * redundant metadata. Optimizing, on the other hand, can sometimes be 117 * done without rebuilding entire structures. 118 * 119 * Generally speaking, the repair code has the following code structure: 120 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. 121 * The first check helps us figure out if we need to rebuild or simply 122 * optimize the structure so that the rebuild knows what to do. The 123 * second check evaluates the completeness of the repair; that is what 124 * is reported to userspace. 125 * 126 * A quick note on symbol prefixes: 127 * - "xfs_" are general XFS symbols. 128 * - "xchk_" are symbols related to metadata checking. 129 * - "xrep_" are symbols related to metadata repair. 130 * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS. 131 */ 132 133 /* 134 * Scrub probe -- userspace uses this to probe if we're willing to scrub 135 * or repair a given mountpoint. This will be used by xfs_scrub to 136 * probe the kernel's abilities to scrub (and repair) the metadata. We 137 * do this by validating the ioctl inputs from userspace, preparing the 138 * filesystem for a scrub (or a repair) operation, and immediately 139 * returning to userspace. Userspace can use the returned errno and 140 * structure state to decide (in broad terms) if scrub/repair are 141 * supported by the running kernel. 
142 */ 143 static int 144 xchk_probe( 145 struct xfs_scrub *sc) 146 { 147 int error = 0; 148 149 if (xchk_should_terminate(sc, &error)) 150 return error; 151 152 return 0; 153 } 154 155 /* Scrub setup and teardown */ 156 157 static inline void 158 xchk_fsgates_disable( 159 struct xfs_scrub *sc) 160 { 161 if (!(sc->flags & XCHK_FSGATES_ALL)) 162 return; 163 164 trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); 165 166 if (sc->flags & XCHK_FSGATES_DRAIN) 167 xfs_drain_wait_disable(); 168 169 if (sc->flags & XCHK_FSGATES_QUOTA) 170 xfs_dqtrx_hook_disable(); 171 172 if (sc->flags & XCHK_FSGATES_DIRENTS) 173 xfs_dir_hook_disable(); 174 175 if (sc->flags & XCHK_FSGATES_RMAP) 176 xfs_rmap_hook_disable(); 177 178 sc->flags &= ~XCHK_FSGATES_ALL; 179 } 180 181 /* Free the resources associated with a scrub subtype. */ 182 void 183 xchk_scrub_free_subord( 184 struct xfs_scrub_subord *sub) 185 { 186 struct xfs_scrub *sc = sub->parent_sc; 187 188 ASSERT(sc->ip == sub->sc.ip); 189 ASSERT(sc->orphanage == sub->sc.orphanage); 190 ASSERT(sc->tempip == sub->sc.tempip); 191 192 sc->sm->sm_type = sub->old_smtype; 193 sc->sm->sm_flags = sub->old_smflags | 194 (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT); 195 sc->tp = sub->sc.tp; 196 197 if (sub->sc.buf) { 198 if (sub->sc.buf_cleanup) 199 sub->sc.buf_cleanup(sub->sc.buf); 200 kvfree(sub->sc.buf); 201 } 202 if (sub->sc.xmbtp) 203 xmbuf_free(sub->sc.xmbtp); 204 if (sub->sc.xfile) 205 xfile_destroy(sub->sc.xfile); 206 207 sc->ilock_flags = sub->sc.ilock_flags; 208 sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags; 209 sc->temp_ilock_flags = sub->sc.temp_ilock_flags; 210 211 kfree(sub); 212 } 213 214 /* Free all the resources and finish the transactions. 
*/
STATIC int
xchk_teardown(
	struct xfs_scrub	*sc,
	int			error)
{
	xchk_ag_free(sc, &sc->sa);

	/*
	 * The transaction is committed only when nothing has gone wrong so
	 * far and the caller asked for a repair; in every other case it is
	 * cancelled.  A commit failure becomes the return value.
	 */
	if (sc->tp) {
		if (error || !(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
			xfs_trans_cancel(sc->tp);
		else
			error = xfs_trans_commit(sc->tp);
		sc->tp = NULL;
	}

	/* Unlock (if locked) and release the inode being scrubbed. */
	if (sc->ip) {
		if (sc->ilock_flags)
			xchk_iunlock(sc, sc->ilock_flags);
		xchk_irele(sc, sc->ip);
		sc->ip = NULL;
	}

	/* Drop the write access taken when the operation was started. */
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}

	/* Free the in-memory btree target, the xfile and the scratch buffer. */
	if (sc->xmbtp) {
		xmbuf_free(sc->xmbtp);
		sc->xmbtp = NULL;
	}
	if (sc->xfile) {
		xfile_destroy(sc->xfile);
		sc->xfile = NULL;
	}
	if (sc->buf) {
		if (sc->buf_cleanup)
			sc->buf_cleanup(sc->buf);
		kvfree(sc->buf);
		sc->buf_cleanup = NULL;
		sc->buf = NULL;
	}

	xrep_tempfile_rele(sc);
	xrep_orphanage_rele(sc);
	xchk_fsgates_disable(sc);
	return error;
}

/* Scrubbing dispatch.
*/ 261 262 static const struct xchk_meta_ops meta_scrub_ops[] = { 263 [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */ 264 .type = ST_NONE, 265 .setup = xchk_setup_fs, 266 .scrub = xchk_probe, 267 .repair = xrep_probe, 268 }, 269 [XFS_SCRUB_TYPE_SB] = { /* superblock */ 270 .type = ST_PERAG, 271 .setup = xchk_setup_agheader, 272 .scrub = xchk_superblock, 273 .repair = xrep_superblock, 274 }, 275 [XFS_SCRUB_TYPE_AGF] = { /* agf */ 276 .type = ST_PERAG, 277 .setup = xchk_setup_agheader, 278 .scrub = xchk_agf, 279 .repair = xrep_agf, 280 }, 281 [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ 282 .type = ST_PERAG, 283 .setup = xchk_setup_agheader, 284 .scrub = xchk_agfl, 285 .repair = xrep_agfl, 286 }, 287 [XFS_SCRUB_TYPE_AGI] = { /* agi */ 288 .type = ST_PERAG, 289 .setup = xchk_setup_agheader, 290 .scrub = xchk_agi, 291 .repair = xrep_agi, 292 }, 293 [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ 294 .type = ST_PERAG, 295 .setup = xchk_setup_ag_allocbt, 296 .scrub = xchk_allocbt, 297 .repair = xrep_allocbt, 298 .repair_eval = xrep_revalidate_allocbt, 299 }, 300 [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ 301 .type = ST_PERAG, 302 .setup = xchk_setup_ag_allocbt, 303 .scrub = xchk_allocbt, 304 .repair = xrep_allocbt, 305 .repair_eval = xrep_revalidate_allocbt, 306 }, 307 [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ 308 .type = ST_PERAG, 309 .setup = xchk_setup_ag_iallocbt, 310 .scrub = xchk_iallocbt, 311 .repair = xrep_iallocbt, 312 .repair_eval = xrep_revalidate_iallocbt, 313 }, 314 [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ 315 .type = ST_PERAG, 316 .setup = xchk_setup_ag_iallocbt, 317 .scrub = xchk_iallocbt, 318 .has = xfs_has_finobt, 319 .repair = xrep_iallocbt, 320 .repair_eval = xrep_revalidate_iallocbt, 321 }, 322 [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ 323 .type = ST_PERAG, 324 .setup = xchk_setup_ag_rmapbt, 325 .scrub = xchk_rmapbt, 326 .has = xfs_has_rmapbt, 327 .repair = xrep_rmapbt, 328 }, 329 [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ 330 .type = ST_PERAG, 331 .setup = 
xchk_setup_ag_refcountbt, 332 .scrub = xchk_refcountbt, 333 .has = xfs_has_reflink, 334 .repair = xrep_refcountbt, 335 }, 336 [XFS_SCRUB_TYPE_INODE] = { /* inode record */ 337 .type = ST_INODE, 338 .setup = xchk_setup_inode, 339 .scrub = xchk_inode, 340 .repair = xrep_inode, 341 }, 342 [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ 343 .type = ST_INODE, 344 .setup = xchk_setup_inode_bmap, 345 .scrub = xchk_bmap_data, 346 .repair = xrep_bmap_data, 347 }, 348 [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ 349 .type = ST_INODE, 350 .setup = xchk_setup_inode_bmap, 351 .scrub = xchk_bmap_attr, 352 .repair = xrep_bmap_attr, 353 }, 354 [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ 355 .type = ST_INODE, 356 .setup = xchk_setup_inode_bmap, 357 .scrub = xchk_bmap_cow, 358 .repair = xrep_bmap_cow, 359 }, 360 [XFS_SCRUB_TYPE_DIR] = { /* directory */ 361 .type = ST_INODE, 362 .setup = xchk_setup_directory, 363 .scrub = xchk_directory, 364 .repair = xrep_directory, 365 }, 366 [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ 367 .type = ST_INODE, 368 .setup = xchk_setup_xattr, 369 .scrub = xchk_xattr, 370 .repair = xrep_xattr, 371 }, 372 [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ 373 .type = ST_INODE, 374 .setup = xchk_setup_symlink, 375 .scrub = xchk_symlink, 376 .repair = xrep_symlink, 377 }, 378 [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ 379 .type = ST_INODE, 380 .setup = xchk_setup_parent, 381 .scrub = xchk_parent, 382 .repair = xrep_parent, 383 }, 384 [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ 385 .type = ST_FS, 386 .setup = xchk_setup_rtbitmap, 387 .scrub = xchk_rtbitmap, 388 .repair = xrep_rtbitmap, 389 }, 390 [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ 391 .type = ST_FS, 392 .setup = xchk_setup_rtsummary, 393 .scrub = xchk_rtsummary, 394 .repair = xrep_rtsummary, 395 }, 396 [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ 397 .type = ST_FS, 398 .setup = xchk_setup_quota, 399 .scrub = xchk_quota, 400 .repair = xrep_quota, 401 }, 402 
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ 403 .type = ST_FS, 404 .setup = xchk_setup_quota, 405 .scrub = xchk_quota, 406 .repair = xrep_quota, 407 }, 408 [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ 409 .type = ST_FS, 410 .setup = xchk_setup_quota, 411 .scrub = xchk_quota, 412 .repair = xrep_quota, 413 }, 414 [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ 415 .type = ST_FS, 416 .setup = xchk_setup_fscounters, 417 .scrub = xchk_fscounters, 418 .repair = xrep_fscounters, 419 }, 420 [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ 421 .type = ST_FS, 422 .setup = xchk_setup_quotacheck, 423 .scrub = xchk_quotacheck, 424 .repair = xrep_quotacheck, 425 }, 426 [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ 427 .type = ST_FS, 428 .setup = xchk_setup_nlinks, 429 .scrub = xchk_nlinks, 430 .repair = xrep_nlinks, 431 }, 432 [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */ 433 .type = ST_FS, 434 .setup = xchk_setup_fs, 435 .scrub = xchk_health_record, 436 .repair = xrep_notsupported, 437 }, 438 [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */ 439 .type = ST_INODE, 440 .setup = xchk_setup_dirtree, 441 .scrub = xchk_dirtree, 442 .has = xfs_has_parent, 443 .repair = xrep_dirtree, 444 }, 445 }; 446 447 static int 448 xchk_validate_inputs( 449 struct xfs_mount *mp, 450 struct xfs_scrub_metadata *sm) 451 { 452 int error; 453 const struct xchk_meta_ops *ops; 454 455 error = -EINVAL; 456 /* Check our inputs. */ 457 sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; 458 if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) 459 goto out; 460 /* sm_reserved[] must be zero */ 461 if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) 462 goto out; 463 464 error = -ENOENT; 465 /* Do we know about this type of metadata? */ 466 if (sm->sm_type >= XFS_SCRUB_TYPE_NR) 467 goto out; 468 ops = &meta_scrub_ops[sm->sm_type]; 469 if (ops->setup == NULL || ops->scrub == NULL) 470 goto out; 471 /* Does this fs even support this type of metadata? 
*/ 472 if (ops->has && !ops->has(mp)) 473 goto out; 474 475 error = -EINVAL; 476 /* restricting fields must be appropriate for type */ 477 switch (ops->type) { 478 case ST_NONE: 479 case ST_FS: 480 if (sm->sm_ino || sm->sm_gen || sm->sm_agno) 481 goto out; 482 break; 483 case ST_PERAG: 484 if (sm->sm_ino || sm->sm_gen || 485 sm->sm_agno >= mp->m_sb.sb_agcount) 486 goto out; 487 break; 488 case ST_INODE: 489 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) 490 goto out; 491 break; 492 default: 493 goto out; 494 } 495 496 /* No rebuild without repair. */ 497 if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) && 498 !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) 499 return -EINVAL; 500 501 /* 502 * We only want to repair read-write v5+ filesystems. Defer the check 503 * for ops->repair until after our scrub confirms that we need to 504 * perform repairs so that we avoid failing due to not supporting 505 * repairing an object that doesn't need repairs. 506 */ 507 if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { 508 error = -EOPNOTSUPP; 509 if (!xfs_has_crc(mp)) 510 goto out; 511 512 error = -EROFS; 513 if (xfs_is_readonly(mp)) 514 goto out; 515 } 516 517 error = 0; 518 out: 519 return error; 520 } 521 522 #ifdef CONFIG_XFS_ONLINE_REPAIR 523 static inline void xchk_postmortem(struct xfs_scrub *sc) 524 { 525 /* 526 * Userspace asked us to repair something, we repaired it, rescanned 527 * it, and the rescan says it's still broken. Scream about this in 528 * the system logs. 529 */ 530 if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && 531 (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 532 XFS_SCRUB_OFLAG_XCORRUPT))) 533 xrep_failure(sc->mp); 534 } 535 #else 536 static inline void xchk_postmortem(struct xfs_scrub *sc) 537 { 538 /* 539 * Userspace asked us to scrub something, it's broken, and we have no 540 * way of fixing it. Scream in the logs. 
541 */ 542 if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 543 XFS_SCRUB_OFLAG_XCORRUPT)) 544 xfs_alert_ratelimited(sc->mp, 545 "Corruption detected during scrub."); 546 } 547 #endif /* CONFIG_XFS_ONLINE_REPAIR */ 548 549 /* 550 * Create a new scrub context from an existing one, but with a different scrub 551 * type. 552 */ 553 struct xfs_scrub_subord * 554 xchk_scrub_create_subord( 555 struct xfs_scrub *sc, 556 unsigned int subtype) 557 { 558 struct xfs_scrub_subord *sub; 559 560 sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS); 561 if (!sub) 562 return ERR_PTR(-ENOMEM); 563 564 sub->old_smtype = sc->sm->sm_type; 565 sub->old_smflags = sc->sm->sm_flags; 566 sub->parent_sc = sc; 567 memcpy(&sub->sc, sc, sizeof(struct xfs_scrub)); 568 sub->sc.ops = &meta_scrub_ops[subtype]; 569 sub->sc.sm->sm_type = subtype; 570 sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; 571 sub->sc.buf = NULL; 572 sub->sc.buf_cleanup = NULL; 573 sub->sc.xfile = NULL; 574 sub->sc.xmbtp = NULL; 575 576 return sub; 577 } 578 579 /* Dispatch metadata scrubbing. */ 580 STATIC int 581 xfs_scrub_metadata( 582 struct file *file, 583 struct xfs_scrub_metadata *sm) 584 { 585 struct xchk_stats_run run = { }; 586 struct xfs_scrub *sc; 587 struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; 588 u64 check_start; 589 int error = 0; 590 591 BUILD_BUG_ON(sizeof(meta_scrub_ops) != 592 (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR)); 593 594 trace_xchk_start(XFS_I(file_inode(file)), sm, error); 595 596 /* Forbidden if we are shut down or mounted norecovery. */ 597 error = -ESHUTDOWN; 598 if (xfs_is_shutdown(mp)) 599 goto out; 600 error = -ENOTRECOVERABLE; 601 if (xfs_has_norecovery(mp)) 602 goto out; 603 604 error = xchk_validate_inputs(mp, sm); 605 if (error) 606 goto out; 607 608 xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, 609 "EXPERIMENTAL online scrub feature in use. 
Use at your own risk!"); 610 611 sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); 612 if (!sc) { 613 error = -ENOMEM; 614 goto out; 615 } 616 617 sc->mp = mp; 618 sc->file = file; 619 sc->sm = sm; 620 sc->ops = &meta_scrub_ops[sm->sm_type]; 621 sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); 622 sc->relax = INIT_XCHK_RELAX; 623 retry_op: 624 /* 625 * When repairs are allowed, prevent freezing or readonly remount while 626 * scrub is running with a real transaction. 627 */ 628 if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { 629 error = mnt_want_write_file(sc->file); 630 if (error) 631 goto out_sc; 632 633 sc->flags |= XCHK_HAVE_FREEZE_PROT; 634 } 635 636 /* Set up for the operation. */ 637 error = sc->ops->setup(sc); 638 if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) 639 goto try_harder; 640 if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) 641 goto need_drain; 642 if (error) 643 goto out_teardown; 644 645 /* Scrub for errors. */ 646 check_start = xchk_stats_now(); 647 if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) 648 error = sc->ops->repair_eval(sc); 649 else 650 error = sc->ops->scrub(sc); 651 run.scrub_ns += xchk_stats_elapsed_ns(check_start); 652 if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) 653 goto try_harder; 654 if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) 655 goto need_drain; 656 if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) 657 goto out_teardown; 658 659 xchk_update_health(sc); 660 661 if (xchk_could_repair(sc)) { 662 /* 663 * If userspace asked for a repair but it wasn't necessary, 664 * report that back to userspace. 665 */ 666 if (!xrep_will_attempt(sc)) { 667 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; 668 goto out_nofix; 669 } 670 671 /* 672 * If it's broken, userspace wants us to fix it, and we haven't 673 * already tried to fix it, then attempt a repair. 
674 */ 675 error = xrep_attempt(sc, &run); 676 if (error == -EAGAIN) { 677 /* 678 * Either the repair function succeeded or it couldn't 679 * get all the resources it needs; either way, we go 680 * back to the beginning and call the scrub function. 681 */ 682 error = xchk_teardown(sc, 0); 683 if (error) { 684 xrep_failure(mp); 685 goto out_sc; 686 } 687 goto retry_op; 688 } 689 } 690 691 out_nofix: 692 xchk_postmortem(sc); 693 out_teardown: 694 error = xchk_teardown(sc, error); 695 out_sc: 696 if (error != -ENOENT) 697 xchk_stats_merge(mp, sm, &run); 698 kfree(sc); 699 out: 700 trace_xchk_done(XFS_I(file_inode(file)), sm, error); 701 if (error == -EFSCORRUPTED || error == -EFSBADCRC) { 702 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; 703 error = 0; 704 } 705 return error; 706 need_drain: 707 error = xchk_teardown(sc, 0); 708 if (error) 709 goto out_sc; 710 sc->flags |= XCHK_NEED_DRAIN; 711 run.retries++; 712 goto retry_op; 713 try_harder: 714 /* 715 * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down 716 * everything we hold, then set up again with preparation for 717 * worst-case scenarios. 718 */ 719 error = xchk_teardown(sc, 0); 720 if (error) 721 goto out_sc; 722 sc->flags |= XCHK_TRY_HARDER; 723 run.retries++; 724 goto retry_op; 725 } 726 727 /* Scrub one aspect of one piece of metadata. */ 728 int 729 xfs_ioc_scrub_metadata( 730 struct file *file, 731 void __user *arg) 732 { 733 struct xfs_scrub_metadata scrub; 734 int error; 735 736 if (!capable(CAP_SYS_ADMIN)) 737 return -EPERM; 738 739 if (copy_from_user(&scrub, arg, sizeof(scrub))) 740 return -EFAULT; 741 742 error = xfs_scrub_metadata(file, &scrub); 743 if (error) 744 return error; 745 746 if (copy_to_user(arg, &scrub, sizeof(scrub))) 747 return -EFAULT; 748 749 return 0; 750 } 751 752 /* Decide if there have been any scrub failures up to this point. 
*/ 753 static inline int 754 xfs_scrubv_check_barrier( 755 struct xfs_mount *mp, 756 const struct xfs_scrub_vec *vectors, 757 const struct xfs_scrub_vec *stop_vec) 758 { 759 const struct xfs_scrub_vec *v; 760 __u32 failmask; 761 762 failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT; 763 764 for (v = vectors; v < stop_vec; v++) { 765 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) 766 continue; 767 768 /* 769 * Runtime errors count as a previous failure, except the ones 770 * used to ask userspace to retry. 771 */ 772 switch (v->sv_ret) { 773 case -EBUSY: 774 case -ENOENT: 775 case -EUSERS: 776 case 0: 777 break; 778 default: 779 return -ECANCELED; 780 } 781 782 /* 783 * If any of the out-flags on the scrub vector match the mask 784 * that was set on the barrier vector, that's a previous fail. 785 */ 786 if (v->sv_flags & failmask) 787 return -ECANCELED; 788 } 789 790 return 0; 791 } 792 793 /* 794 * If the caller provided us with a nonzero inode number that isn't the ioctl 795 * file, try to grab a reference to it to eliminate all further untrusted inode 796 * lookups. If we can't get the inode, let each scrub function try again. 797 */ 798 STATIC struct xfs_inode * 799 xchk_scrubv_open_by_handle( 800 struct xfs_mount *mp, 801 const struct xfs_scrub_vec_head *head) 802 { 803 struct xfs_trans *tp; 804 struct xfs_inode *ip; 805 int error; 806 807 error = xfs_trans_alloc_empty(mp, &tp); 808 if (error) 809 return NULL; 810 811 error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); 812 xfs_trans_cancel(tp); 813 if (error) 814 return NULL; 815 816 if (VFS_I(ip)->i_generation != head->svh_gen) { 817 xfs_irele(ip); 818 return NULL; 819 } 820 821 return ip; 822 } 823 824 /* Vectored scrub implementation to reduce ioctl calls. 
*/
/*
 * Run a userspace-supplied array of scrub requests ("vectors") in one call.
 *
 * @file: file the ioctl was issued on; identifies the filesystem.
 * @arg:  user pointer to a struct xfs_scrub_vec_head.  Its svh_vectors field
 *        holds the user address of svh_nr struct xfs_scrub_vec items.
 *
 * Each ordinary vector is run through xfs_scrub_metadata() with the ino, gen
 * and agno taken from the head; the outcome is stored in that vector's
 * sv_ret and sv_flags.  A XFS_SCRUB_TYPE_BARRIER vector stops processing if
 * any earlier vector failed in a way selected by the barrier's flags.  If
 * svh_rest_us is nonzero, the task sleeps (killably) for that many
 * microseconds after each scrub.
 *
 * Returns 0 once the vectors and head have been copied back to userspace
 * (also when svh_nr is zero), or:
 *   -EPERM   caller lacks CAP_SYS_ADMIN;
 *   -EFAULT  a copy from or to userspace failed;
 *   -EINVAL  nonzero reserved fields, unknown head flags, or a barrier vector
 *            carrying flags other than output flags;
 *   -ENOMEM  the vector array is larger than one page;
 *   -EINTR   a fatal signal is pending;
 * or whatever error memdup_user() reports.
 */
int
xfs_ioc_scrubv_metadata(
	struct file				*file,
	void __user				*arg)
{
	struct xfs_scrub_vec_head		head;
	struct xfs_scrub_vec_head __user	*uhead = arg;
	struct xfs_scrub_vec			*vectors;
	struct xfs_scrub_vec __user		*uvectors;
	struct xfs_inode			*ip_in = XFS_I(file_inode(file));
	struct xfs_mount			*mp = ip_in->i_mount;
	struct xfs_inode			*handle_ip = NULL;
	struct xfs_scrub_vec			*v;
	size_t					vec_bytes;
	unsigned int				i;
	int					error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&head, uhead, sizeof(head)))
		return -EFAULT;

	if (head.svh_reserved)
		return -EINVAL;
	if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
		return -EINVAL;
	if (head.svh_nr == 0)
		return 0;

	/* Refuse vector arrays that do not fit in a single page. */
	vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
	if (vec_bytes > PAGE_SIZE)
		return -ENOMEM;

	/*
	 * The vector array address arrives as a 64-bit integer in the head;
	 * use the standard helper to turn it into a user pointer.
	 */
	uvectors = u64_to_user_ptr(head.svh_vectors);
	vectors = memdup_user(uvectors, vec_bytes);
	if (IS_ERR(vectors))
		return PTR_ERR(vectors);

	trace_xchk_scrubv_start(ip_in, &head);

	/* Validate every vector before running any of them. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		if (v->sv_reserved) {
			error = -EINVAL;
			goto out_free;
		}

		/* A barrier may only carry output flags (its fail mask). */
		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
		    (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
			error = -EINVAL;
			goto out_free;
		}

		trace_xchk_scrubv_item(mp, &head, i, v);
	}

	/*
	 * If the caller wants us to do a scrub-by-handle and the file used to
	 * call the ioctl is not the same file, load the incore inode and pin
	 * it across all the scrubv actions to avoid repeated UNTRUSTED
	 * lookups.  The reference is not passed to deeper layers of scrub
	 * because each scrubber gets to decide its own strategy and return
	 * values for getting an inode.
	 */
	if (head.svh_ino && head.svh_ino != ip_in->i_ino)
		handle_ip = xchk_scrubv_open_by_handle(mp, &head);

	/* Run all the scrubbers. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		struct xfs_scrub_metadata	sm = {
			.sm_type		= v->sv_type,
			.sm_flags		= v->sv_flags,
			.sm_ino			= head.svh_ino,
			.sm_gen			= head.svh_gen,
			.sm_agno		= head.svh_agno,
		};

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
			/*
			 * A tripped barrier ends the run, but what has been
			 * collected so far is still copied out below.
			 */
			v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
			if (v->sv_ret) {
				trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
				break;
			}

			continue;
		}

		v->sv_ret = xfs_scrub_metadata(file, &sm);
		v->sv_flags = sm.sm_flags;

		trace_xchk_scrubv_outcome(mp, &head, i, v);

		/* Rest between items if the caller asked for that. */
		if (head.svh_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					head.svh_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out_free;
		}
	}

	/* Report the per-vector results, then the head, to userspace. */
	if (copy_to_user(uvectors, vectors, vec_bytes) ||
	    copy_to_user(uhead, &head, sizeof(head)))
		error = -EFAULT;

out_free:
	if (handle_ip)
		xfs_irele(handle_ip);
	kfree(vectors);
	return error;
}