1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_icache.h" 20 #include "xfs_inode_buf.h" 21 #include "xfs_inode_fork.h" 22 #include "xfs_ialloc.h" 23 #include "xfs_da_format.h" 24 #include "xfs_reflink.h" 25 #include "xfs_alloc.h" 26 #include "xfs_rmap.h" 27 #include "xfs_rmap_btree.h" 28 #include "xfs_bmap.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_util.h" 31 #include "xfs_dir2.h" 32 #include "xfs_dir2_priv.h" 33 #include "xfs_quota_defs.h" 34 #include "xfs_quota.h" 35 #include "xfs_ag.h" 36 #include "xfs_rtbitmap.h" 37 #include "xfs_attr_leaf.h" 38 #include "xfs_log_priv.h" 39 #include "xfs_health.h" 40 #include "xfs_symlink_remote.h" 41 #include "scrub/xfs_scrub.h" 42 #include "scrub/scrub.h" 43 #include "scrub/common.h" 44 #include "scrub/btree.h" 45 #include "scrub/trace.h" 46 #include "scrub/repair.h" 47 #include "scrub/iscan.h" 48 #include "scrub/readdir.h" 49 #include "scrub/tempfile.h" 50 51 /* 52 * Inode Record Repair 53 * =================== 54 * 55 * Roughly speaking, inode problems can be classified based on whether or not 56 * they trip the dinode verifiers. If those trip, then we won't be able to 57 * xfs_iget ourselves the inode. 58 * 59 * Therefore, the xrep_dinode_* functions fix anything that will cause the 60 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions 61 * fix things on live incore inodes. 
The inode repair functions make decisions 62 * with security and usability implications when reviving a file: 63 * 64 * - Files with zero di_mode or a garbage di_mode are converted to regular file 65 * that only root can read. This file may not actually contain user data, 66 * if the file was not previously a regular file. Setuid and setgid bits 67 * are cleared. 68 * 69 * - Zero-size directories can be truncated to look empty. It is necessary to 70 * run the bmapbtd and directory repair functions to fully rebuild the 71 * directory. 72 * 73 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary 74 * to run the bmapbtd and symlink repair functions to salvage the symlink. 75 * 76 * - Invalid extent size hints will be removed. 77 * 78 * - Quotacheck will be scheduled if we repaired an inode that was so badly 79 * damaged that the ondisk inode had to be rebuilt. 80 * 81 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. 82 * Setuid and setgid bits are cleared. 83 * 84 * - Data and attr forks are reset to extents format with zero extents if the 85 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta 86 * repair functions to recover the space mapping. 87 * 88 * - ACLs will not be recovered if the attr fork is zapped or the extended 89 * attribute structure itself requires salvaging. 90 * 91 * - If the attr fork is zapped, the user and group ids are reset to root and 92 * the setuid and setgid bits are removed. 93 */ 94 95 /* 96 * All the information we need to repair the ondisk inode if we can't iget the 97 * incore inode. We don't allocate this buffer unless we're going to perform 98 * a repair to the ondisk inode cluster buffer. 99 */ 100 struct xrep_inode { 101 /* Inode mapping that we saved from the initial lookup attempt. */ 102 struct xfs_imap imap; 103 104 struct xfs_scrub *sc; 105 106 /* Blocks in use on the data device by data extents or bmbt blocks. 
*/ 107 xfs_rfsblock_t data_blocks; 108 109 /* Blocks in use on the rt device. */ 110 xfs_rfsblock_t rt_blocks; 111 112 /* Blocks in use by the attr fork. */ 113 xfs_rfsblock_t attr_blocks; 114 115 /* Number of data device extents for the data fork. */ 116 xfs_extnum_t data_extents; 117 118 /* 119 * Number of realtime device extents for the data fork. If 120 * data_extents and rt_extents indicate that the data fork has extents 121 * on both devices, we'll just back away slowly. 122 */ 123 xfs_extnum_t rt_extents; 124 125 /* Number of (data device) extents for the attr fork. */ 126 xfs_aextnum_t attr_extents; 127 128 /* Sick state to set after zapping parts of the inode. */ 129 unsigned int ino_sick_mask; 130 131 /* Must we remove all access from this file? */ 132 bool zap_acls; 133 134 /* Inode scanner to see if we can find the ftype from dirents */ 135 struct xchk_iscan ftype_iscan; 136 uint8_t alleged_ftype; 137 }; 138 139 /* 140 * Setup function for inode repair. @imap contains the ondisk inode mapping 141 * information so that we can correct the ondisk inode cluster buffer if 142 * necessary to make iget work. 143 */ 144 int 145 xrep_setup_inode( 146 struct xfs_scrub *sc, 147 const struct xfs_imap *imap) 148 { 149 struct xrep_inode *ri; 150 151 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); 152 if (!sc->buf) 153 return -ENOMEM; 154 155 ri = sc->buf; 156 memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); 157 ri->sc = sc; 158 return 0; 159 } 160 161 /* 162 * Make sure this ondisk inode can pass the inode buffer verifier. This is 163 * not the same as the dinode verifier. 
164 */ 165 STATIC void 166 xrep_dinode_buf_core( 167 struct xfs_scrub *sc, 168 struct xfs_buf *bp, 169 unsigned int ioffset) 170 { 171 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); 172 struct xfs_trans *tp = sc->tp; 173 struct xfs_mount *mp = sc->mp; 174 xfs_agino_t agino; 175 bool crc_ok = false; 176 bool magic_ok = false; 177 bool unlinked_ok = false; 178 179 agino = be32_to_cpu(dip->di_next_unlinked); 180 181 if (xfs_verify_agino_or_null(bp->b_pag, agino)) 182 unlinked_ok = true; 183 184 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 185 xfs_dinode_good_version(mp, dip->di_version)) 186 magic_ok = true; 187 188 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 189 XFS_DINODE_CRC_OFF)) 190 crc_ok = true; 191 192 if (magic_ok && unlinked_ok && crc_ok) 193 return; 194 195 if (!magic_ok) { 196 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 197 dip->di_version = 3; 198 } 199 if (!unlinked_ok) 200 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 201 xfs_dinode_calc_crc(mp, dip); 202 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 203 xfs_trans_log_buf(tp, bp, ioffset, 204 ioffset + sizeof(struct xfs_dinode) - 1); 205 } 206 207 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ 208 STATIC void 209 xrep_dinode_buf( 210 struct xfs_scrub *sc, 211 struct xfs_buf *bp) 212 { 213 struct xfs_mount *mp = sc->mp; 214 int i; 215 int ni; 216 217 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 218 for (i = 0; i < ni; i++) 219 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); 220 } 221 222 /* Reinitialize things that never change in an inode. 
*/ 223 STATIC void 224 xrep_dinode_header( 225 struct xfs_scrub *sc, 226 struct xfs_dinode *dip) 227 { 228 trace_xrep_dinode_header(sc, dip); 229 230 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 231 if (!xfs_dinode_good_version(sc->mp, dip->di_version)) 232 dip->di_version = 3; 233 dip->di_ino = cpu_to_be64(sc->sm->sm_ino); 234 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); 235 dip->di_gen = cpu_to_be32(sc->sm->sm_gen); 236 } 237 238 /* 239 * If this directory entry points to the scrub target inode, then the directory 240 * we're scanning is the parent of the scrub target inode. 241 */ 242 STATIC int 243 xrep_dinode_findmode_dirent( 244 struct xfs_scrub *sc, 245 struct xfs_inode *dp, 246 xfs_dir2_dataptr_t dapos, 247 const struct xfs_name *name, 248 xfs_ino_t ino, 249 void *priv) 250 { 251 struct xrep_inode *ri = priv; 252 int error = 0; 253 254 if (xchk_should_terminate(ri->sc, &error)) 255 return error; 256 257 if (ino != sc->sm->sm_ino) 258 return 0; 259 260 /* Ignore garbage directory entry names. */ 261 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) 262 return -EFSCORRUPTED; 263 264 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 265 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 266 xfs_dir2_samename(name, &xfs_name_dot)) 267 return 0; 268 269 /* 270 * Uhoh, more than one parent for this inode and they don't agree on 271 * the file type? 272 */ 273 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && 274 ri->alleged_ftype != name->type) { 275 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, 276 ri->alleged_ftype); 277 return -EFSCORRUPTED; 278 } 279 280 /* We found a potential parent; remember the ftype. */ 281 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); 282 ri->alleged_ftype = name->type; 283 return 0; 284 } 285 286 /* Try to lock a directory, or wait a jiffy. 
*/ 287 static inline int 288 xrep_dinode_ilock_nowait( 289 struct xfs_inode *dp, 290 unsigned int lock_mode) 291 { 292 if (xfs_ilock_nowait(dp, lock_mode)) 293 return true; 294 295 schedule_timeout_killable(1); 296 return false; 297 } 298 299 /* 300 * Try to lock a directory to look for ftype hints. Since we already hold the 301 * AGI buffer, we cannot block waiting for the ILOCK because rename can take 302 * the ILOCK and then try to lock AGIs. 303 */ 304 STATIC int 305 xrep_dinode_trylock_directory( 306 struct xrep_inode *ri, 307 struct xfs_inode *dp, 308 unsigned int *lock_modep) 309 { 310 unsigned long deadline = jiffies + msecs_to_jiffies(30000); 311 unsigned int lock_mode; 312 int error = 0; 313 314 do { 315 if (xchk_should_terminate(ri->sc, &error)) 316 return error; 317 318 if (xfs_need_iread_extents(&dp->i_df)) 319 lock_mode = XFS_ILOCK_EXCL; 320 else 321 lock_mode = XFS_ILOCK_SHARED; 322 323 if (xrep_dinode_ilock_nowait(dp, lock_mode)) { 324 *lock_modep = lock_mode; 325 return 0; 326 } 327 } while (!time_is_before_jiffies(deadline)); 328 return -EBUSY; 329 } 330 331 /* 332 * If this is a directory, walk the dirents looking for any that point to the 333 * scrub target inode. 334 */ 335 STATIC int 336 xrep_dinode_findmode_walk_directory( 337 struct xrep_inode *ri, 338 struct xfs_inode *dp) 339 { 340 struct xfs_scrub *sc = ri->sc; 341 unsigned int lock_mode; 342 int error = 0; 343 344 /* Ignore temporary repair directories. */ 345 if (xrep_is_tempfile(dp)) 346 return 0; 347 348 /* 349 * Scan the directory to see if there it contains an entry pointing to 350 * the directory that we are repairing. 351 */ 352 error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); 353 if (error) 354 return error; 355 356 /* 357 * If this directory is known to be sick, we cannot scan it reliably 358 * and must abort. 
359 */ 360 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | 361 XFS_SICK_INO_BMBTD | 362 XFS_SICK_INO_DIR)) { 363 error = -EFSCORRUPTED; 364 goto out_unlock; 365 } 366 367 /* 368 * We cannot complete our parent pointer scan if a directory looks as 369 * though it has been zapped by the inode record repair code. 370 */ 371 if (xchk_dir_looks_zapped(dp)) { 372 error = -EBUSY; 373 goto out_unlock; 374 } 375 376 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); 377 if (error) 378 goto out_unlock; 379 380 out_unlock: 381 xfs_iunlock(dp, lock_mode); 382 return error; 383 } 384 385 /* 386 * Try to find the mode of the inode being repaired by looking for directories 387 * that point down to this file. 388 */ 389 STATIC int 390 xrep_dinode_find_mode( 391 struct xrep_inode *ri, 392 uint16_t *mode) 393 { 394 struct xfs_scrub *sc = ri->sc; 395 struct xfs_inode *dp; 396 int error; 397 398 /* No ftype means we have no other metadata to consult. */ 399 if (!xfs_has_ftype(sc->mp)) { 400 *mode = S_IFREG; 401 return 0; 402 } 403 404 /* 405 * Scan all directories for parents that might point down to this 406 * inode. Skip the inode being repaired during the scan since it 407 * cannot be its own parent. Note that we still hold the AGI locked 408 * so there's a real possibility that _iscan_iter can return EBUSY. 
409 */ 410 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); 411 xchk_iscan_set_agi_trylock(&ri->ftype_iscan); 412 ri->ftype_iscan.skip_ino = sc->sm->sm_ino; 413 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; 414 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { 415 if (S_ISDIR(VFS_I(dp)->i_mode)) 416 error = xrep_dinode_findmode_walk_directory(ri, dp); 417 xchk_iscan_mark_visited(&ri->ftype_iscan, dp); 418 xchk_irele(sc, dp); 419 if (error < 0) 420 break; 421 if (xchk_should_terminate(sc, &error)) 422 break; 423 } 424 xchk_iscan_iter_finish(&ri->ftype_iscan); 425 xchk_iscan_teardown(&ri->ftype_iscan); 426 427 if (error == -EBUSY) { 428 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { 429 /* 430 * If we got an EBUSY after finding at least one 431 * dirent, that means the scan found an inode on the 432 * inactivation list and could not open it. Accept the 433 * alleged ftype and install a new mode below. 434 */ 435 error = 0; 436 } else if (!(sc->flags & XCHK_TRY_HARDER)) { 437 /* 438 * Otherwise, retry the operation one time to see if 439 * the reason for the delay is an inode from the same 440 * cluster buffer waiting on the inactivation list. 441 */ 442 error = -EDEADLOCK; 443 } 444 } 445 if (error) 446 return error; 447 448 /* 449 * Convert the discovered ftype into the file mode. If all else fails, 450 * return S_IFREG. 451 */ 452 switch (ri->alleged_ftype) { 453 case XFS_DIR3_FT_DIR: 454 *mode = S_IFDIR; 455 break; 456 case XFS_DIR3_FT_WHT: 457 case XFS_DIR3_FT_CHRDEV: 458 *mode = S_IFCHR; 459 break; 460 case XFS_DIR3_FT_BLKDEV: 461 *mode = S_IFBLK; 462 break; 463 case XFS_DIR3_FT_FIFO: 464 *mode = S_IFIFO; 465 break; 466 case XFS_DIR3_FT_SOCK: 467 *mode = S_IFSOCK; 468 break; 469 case XFS_DIR3_FT_SYMLINK: 470 *mode = S_IFLNK; 471 break; 472 default: 473 *mode = S_IFREG; 474 break; 475 } 476 return 0; 477 } 478 479 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. 
*/ 480 STATIC int 481 xrep_dinode_mode( 482 struct xrep_inode *ri, 483 struct xfs_dinode *dip) 484 { 485 struct xfs_scrub *sc = ri->sc; 486 uint16_t mode = be16_to_cpu(dip->di_mode); 487 int error; 488 489 trace_xrep_dinode_mode(sc, dip); 490 491 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) 492 return 0; 493 494 /* Try to fix the mode. If we cannot, then leave everything alone. */ 495 error = xrep_dinode_find_mode(ri, &mode); 496 switch (error) { 497 case -EINTR: 498 case -EBUSY: 499 case -EDEADLOCK: 500 /* temporary failure or fatal signal */ 501 return error; 502 case 0: 503 /* found mode */ 504 break; 505 default: 506 /* some other error, assume S_IFREG */ 507 mode = S_IFREG; 508 break; 509 } 510 511 /* bad mode, so we set it to a file that only root can read */ 512 dip->di_mode = cpu_to_be16(mode); 513 dip->di_uid = 0; 514 dip->di_gid = 0; 515 ri->zap_acls = true; 516 return 0; 517 } 518 519 /* Fix unused link count fields having nonzero values. */ 520 STATIC void 521 xrep_dinode_nlinks( 522 struct xfs_dinode *dip) 523 { 524 if (dip->di_version < 2) { 525 dip->di_nlink = 0; 526 return; 527 } 528 529 if (xfs_dinode_is_metadir(dip)) { 530 if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) 531 dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN); 532 } else { 533 dip->di_metatype = 0; 534 } 535 } 536 537 /* Fix any conflicting flags that the verifiers complain about. */ 538 STATIC void 539 xrep_dinode_flags( 540 struct xfs_scrub *sc, 541 struct xfs_dinode *dip, 542 bool isrt) 543 { 544 struct xfs_mount *mp = sc->mp; 545 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 546 uint16_t flags = be16_to_cpu(dip->di_flags); 547 uint16_t mode = be16_to_cpu(dip->di_mode); 548 549 trace_xrep_dinode_flags(sc, dip); 550 551 if (isrt) 552 flags |= XFS_DIFLAG_REALTIME; 553 else 554 flags &= ~XFS_DIFLAG_REALTIME; 555 556 /* 557 * For regular files on a reflink filesystem, set the REFLINK flag to 558 * protect shared extents. 
A later stage will actually check those 559 * extents and clear the flag if possible. 560 */ 561 if (xfs_has_reflink(mp) && S_ISREG(mode)) 562 flags2 |= XFS_DIFLAG2_REFLINK; 563 else 564 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 565 if (flags & XFS_DIFLAG_REALTIME) 566 flags2 &= ~XFS_DIFLAG2_REFLINK; 567 if (!xfs_has_bigtime(mp)) 568 flags2 &= ~XFS_DIFLAG2_BIGTIME; 569 if (!xfs_has_large_extent_counts(mp)) 570 flags2 &= ~XFS_DIFLAG2_NREXT64; 571 if (flags2 & XFS_DIFLAG2_NREXT64) 572 dip->di_nrext64_pad = 0; 573 else if (dip->di_version >= 3) 574 dip->di_v3_pad = 0; 575 576 if (flags2 & XFS_DIFLAG2_METADATA) { 577 xfs_failaddr_t fa; 578 579 fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags, 580 flags2); 581 if (fa) 582 flags2 &= ~XFS_DIFLAG2_METADATA; 583 } 584 585 dip->di_flags = cpu_to_be16(flags); 586 dip->di_flags2 = cpu_to_be64(flags2); 587 } 588 589 /* 590 * Blow out symlink; now it points nowhere. We don't have to worry about 591 * incore state because this inode is failing the verifiers. 592 */ 593 STATIC void 594 xrep_dinode_zap_symlink( 595 struct xrep_inode *ri, 596 struct xfs_dinode *dip) 597 { 598 struct xfs_scrub *sc = ri->sc; 599 char *p; 600 601 trace_xrep_dinode_zap_symlink(sc, dip); 602 603 dip->di_format = XFS_DINODE_FMT_LOCAL; 604 dip->di_size = cpu_to_be64(1); 605 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 606 *p = '?'; 607 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; 608 } 609 610 /* 611 * Blow out dir, make the parent point to the root. In the future repair will 612 * reconstruct this directory for us. Note that there's no in-core directory 613 * inode because the sf verifier tripped, so we don't have to worry about the 614 * dentry cache. 
615 */ 616 STATIC void 617 xrep_dinode_zap_dir( 618 struct xrep_inode *ri, 619 struct xfs_dinode *dip) 620 { 621 struct xfs_scrub *sc = ri->sc; 622 struct xfs_mount *mp = sc->mp; 623 struct xfs_dir2_sf_hdr *sfp; 624 int i8count; 625 626 trace_xrep_dinode_zap_dir(sc, dip); 627 628 dip->di_format = XFS_DINODE_FMT_LOCAL; 629 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; 630 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 631 sfp->count = 0; 632 sfp->i8count = i8count; 633 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); 634 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); 635 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; 636 } 637 638 /* Make sure we don't have a garbage file size. */ 639 STATIC void 640 xrep_dinode_size( 641 struct xrep_inode *ri, 642 struct xfs_dinode *dip) 643 { 644 struct xfs_scrub *sc = ri->sc; 645 uint64_t size = be64_to_cpu(dip->di_size); 646 uint16_t mode = be16_to_cpu(dip->di_mode); 647 648 trace_xrep_dinode_size(sc, dip); 649 650 switch (mode & S_IFMT) { 651 case S_IFIFO: 652 case S_IFCHR: 653 case S_IFBLK: 654 case S_IFSOCK: 655 /* di_size can't be nonzero for special files */ 656 dip->di_size = 0; 657 break; 658 case S_IFREG: 659 /* Regular files can't be larger than 2^63-1 bytes. */ 660 dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); 661 break; 662 case S_IFLNK: 663 /* 664 * Truncate ridiculously oversized symlinks. If the size is 665 * zero, reset it to point to the current directory. Both of 666 * these conditions trigger dinode verifier errors, so there 667 * is no in-core state to reset. 668 */ 669 if (size > XFS_SYMLINK_MAXLEN) 670 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); 671 else if (size == 0) 672 xrep_dinode_zap_symlink(ri, dip); 673 break; 674 case S_IFDIR: 675 /* 676 * Directories can't have a size larger than 32G. If the size 677 * is zero, reset it to an empty directory. Both of these 678 * conditions trigger dinode verifier errors, so there is no 679 * in-core state to reset. 
680 */ 681 if (size > XFS_DIR2_SPACE_SIZE) 682 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); 683 else if (size == 0) 684 xrep_dinode_zap_dir(ri, dip); 685 break; 686 } 687 } 688 689 /* Fix extent size hints. */ 690 STATIC void 691 xrep_dinode_extsize_hints( 692 struct xfs_scrub *sc, 693 struct xfs_dinode *dip) 694 { 695 struct xfs_mount *mp = sc->mp; 696 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 697 uint16_t flags = be16_to_cpu(dip->di_flags); 698 uint16_t mode = be16_to_cpu(dip->di_mode); 699 700 xfs_failaddr_t fa; 701 702 trace_xrep_dinode_extsize_hints(sc, dip); 703 704 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 705 mode, flags); 706 if (fa) { 707 dip->di_extsize = 0; 708 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | 709 XFS_DIFLAG_EXTSZINHERIT); 710 } 711 712 if (dip->di_version < 3) 713 return; 714 715 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 716 mode, flags, flags2); 717 if (fa) { 718 dip->di_cowextsize = 0; 719 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); 720 } 721 } 722 723 /* Count extents and blocks for an inode given an rmap. */ 724 STATIC int 725 xrep_dinode_walk_rmap( 726 struct xfs_btree_cur *cur, 727 const struct xfs_rmap_irec *rec, 728 void *priv) 729 { 730 struct xrep_inode *ri = priv; 731 int error = 0; 732 733 if (xchk_should_terminate(ri->sc, &error)) 734 return error; 735 736 /* We only care about this inode. */ 737 if (rec->rm_owner != ri->sc->sm->sm_ino) 738 return 0; 739 740 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { 741 ri->attr_blocks += rec->rm_blockcount; 742 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 743 ri->attr_extents++; 744 745 return 0; 746 } 747 748 ri->data_blocks += rec->rm_blockcount; 749 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 750 ri->data_extents++; 751 752 return 0; 753 } 754 755 /* Count extents and blocks for an inode from all AG rmap data. 
*/ 756 STATIC int 757 xrep_dinode_count_ag_rmaps( 758 struct xrep_inode *ri, 759 struct xfs_perag *pag) 760 { 761 struct xfs_btree_cur *cur; 762 struct xfs_buf *agf; 763 int error; 764 765 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); 766 if (error) 767 return error; 768 769 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); 770 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); 771 xfs_btree_del_cursor(cur, error); 772 xfs_trans_brelse(ri->sc->tp, agf); 773 return error; 774 } 775 776 /* Count extents and blocks for a given inode from all rmap data. */ 777 STATIC int 778 xrep_dinode_count_rmaps( 779 struct xrep_inode *ri) 780 { 781 struct xfs_perag *pag = NULL; 782 int error; 783 784 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 785 return -EOPNOTSUPP; 786 787 while ((pag = xfs_perag_next(ri->sc->mp, pag))) { 788 error = xrep_dinode_count_ag_rmaps(ri, pag); 789 if (error) { 790 xfs_perag_rele(pag); 791 return error; 792 } 793 } 794 795 /* Can't have extents on both the rt and the data device. */ 796 if (ri->data_extents && ri->rt_extents) 797 return -EFSCORRUPTED; 798 799 trace_xrep_dinode_count_rmaps(ri->sc, 800 ri->data_blocks, ri->rt_blocks, ri->attr_blocks, 801 ri->data_extents, ri->rt_extents, ri->attr_extents); 802 return 0; 803 } 804 805 /* Return true if this extents-format ifork looks like garbage. 
*/ 806 STATIC bool 807 xrep_dinode_bad_extents_fork( 808 struct xfs_scrub *sc, 809 struct xfs_dinode *dip, 810 unsigned int dfork_size, 811 int whichfork) 812 { 813 struct xfs_bmbt_irec new; 814 struct xfs_bmbt_rec *dp; 815 xfs_extnum_t nex; 816 bool isrt; 817 unsigned int i; 818 819 nex = xfs_dfork_nextents(dip, whichfork); 820 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) 821 return true; 822 823 dp = XFS_DFORK_PTR(dip, whichfork); 824 825 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); 826 for (i = 0; i < nex; i++, dp++) { 827 xfs_failaddr_t fa; 828 829 xfs_bmbt_disk_get_all(dp, &new); 830 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, 831 &new); 832 if (fa) 833 return true; 834 } 835 836 return false; 837 } 838 839 /* Return true if this btree-format ifork looks like garbage. */ 840 STATIC bool 841 xrep_dinode_bad_bmbt_fork( 842 struct xfs_scrub *sc, 843 struct xfs_dinode *dip, 844 unsigned int dfork_size, 845 int whichfork) 846 { 847 struct xfs_bmdr_block *dfp; 848 xfs_extnum_t nex; 849 unsigned int i; 850 unsigned int dmxr; 851 unsigned int nrecs; 852 unsigned int level; 853 854 nex = xfs_dfork_nextents(dip, whichfork); 855 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) 856 return true; 857 858 if (dfork_size < sizeof(struct xfs_bmdr_block)) 859 return true; 860 861 dfp = XFS_DFORK_PTR(dip, whichfork); 862 nrecs = be16_to_cpu(dfp->bb_numrecs); 863 level = be16_to_cpu(dfp->bb_level); 864 865 if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size) 866 return true; 867 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) 868 return true; 869 870 dmxr = xfs_bmdr_maxrecs(dfork_size, 0); 871 for (i = 1; i <= nrecs; i++) { 872 struct xfs_bmbt_key *fkp; 873 xfs_bmbt_ptr_t *fpp; 874 xfs_fileoff_t fileoff; 875 xfs_fsblock_t fsbno; 876 877 fkp = xfs_bmdr_key_addr(dfp, i); 878 fileoff = be64_to_cpu(fkp->br_startoff); 879 if (!xfs_verify_fileoff(sc->mp, fileoff)) 880 return true; 881 882 fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr); 
883 fsbno = be64_to_cpu(*fpp); 884 if (!xfs_verify_fsbno(sc->mp, fsbno)) 885 return true; 886 } 887 888 return false; 889 } 890 891 /* 892 * Check the data fork for things that will fail the ifork verifiers or the 893 * ifork formatters. 894 */ 895 STATIC bool 896 xrep_dinode_check_dfork( 897 struct xfs_scrub *sc, 898 struct xfs_dinode *dip, 899 uint16_t mode) 900 { 901 void *dfork_ptr; 902 int64_t data_size; 903 unsigned int fmt; 904 unsigned int dfork_size; 905 906 /* 907 * Verifier functions take signed int64_t, so check for bogus negative 908 * values first. 909 */ 910 data_size = be64_to_cpu(dip->di_size); 911 if (data_size < 0) 912 return true; 913 914 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); 915 switch (mode & S_IFMT) { 916 case S_IFIFO: 917 case S_IFCHR: 918 case S_IFBLK: 919 case S_IFSOCK: 920 if (fmt != XFS_DINODE_FMT_DEV) 921 return true; 922 break; 923 case S_IFREG: 924 if (fmt == XFS_DINODE_FMT_LOCAL) 925 return true; 926 fallthrough; 927 case S_IFLNK: 928 case S_IFDIR: 929 switch (fmt) { 930 case XFS_DINODE_FMT_LOCAL: 931 case XFS_DINODE_FMT_EXTENTS: 932 case XFS_DINODE_FMT_BTREE: 933 break; 934 default: 935 return true; 936 } 937 break; 938 default: 939 return true; 940 } 941 942 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); 943 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 944 945 switch (fmt) { 946 case XFS_DINODE_FMT_DEV: 947 break; 948 case XFS_DINODE_FMT_LOCAL: 949 /* dir/symlink structure cannot be larger than the fork */ 950 if (data_size > dfork_size) 951 return true; 952 /* directory structure must pass verification. */ 953 if (S_ISDIR(mode) && 954 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) 955 return true; 956 /* symlink structure must pass verification. 
*/ 957 if (S_ISLNK(mode) && 958 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) 959 return true; 960 break; 961 case XFS_DINODE_FMT_EXTENTS: 962 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, 963 XFS_DATA_FORK)) 964 return true; 965 break; 966 case XFS_DINODE_FMT_BTREE: 967 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 968 XFS_DATA_FORK)) 969 return true; 970 break; 971 default: 972 return true; 973 } 974 975 return false; 976 } 977 978 static void 979 xrep_dinode_set_data_nextents( 980 struct xfs_dinode *dip, 981 xfs_extnum_t nextents) 982 { 983 if (xfs_dinode_has_large_extent_counts(dip)) 984 dip->di_big_nextents = cpu_to_be64(nextents); 985 else 986 dip->di_nextents = cpu_to_be32(nextents); 987 } 988 989 static void 990 xrep_dinode_set_attr_nextents( 991 struct xfs_dinode *dip, 992 xfs_extnum_t nextents) 993 { 994 if (xfs_dinode_has_large_extent_counts(dip)) 995 dip->di_big_anextents = cpu_to_be32(nextents); 996 else 997 dip->di_anextents = cpu_to_be16(nextents); 998 } 999 1000 /* Reset the data fork to something sane. */ 1001 STATIC void 1002 xrep_dinode_zap_dfork( 1003 struct xrep_inode *ri, 1004 struct xfs_dinode *dip, 1005 uint16_t mode) 1006 { 1007 struct xfs_scrub *sc = ri->sc; 1008 1009 trace_xrep_dinode_zap_dfork(sc, dip); 1010 1011 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; 1012 1013 xrep_dinode_set_data_nextents(dip, 0); 1014 ri->data_blocks = 0; 1015 ri->rt_blocks = 0; 1016 1017 /* Special files always get reset to DEV */ 1018 switch (mode & S_IFMT) { 1019 case S_IFIFO: 1020 case S_IFCHR: 1021 case S_IFBLK: 1022 case S_IFSOCK: 1023 dip->di_format = XFS_DINODE_FMT_DEV; 1024 dip->di_size = 0; 1025 return; 1026 } 1027 1028 /* 1029 * If we have data extents, reset to an empty map and hope the user 1030 * will run the bmapbtd checker next. 
1031 */ 1032 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { 1033 dip->di_format = XFS_DINODE_FMT_EXTENTS; 1034 return; 1035 } 1036 1037 /* Otherwise, reset the local format to the minimum. */ 1038 switch (mode & S_IFMT) { 1039 case S_IFLNK: 1040 xrep_dinode_zap_symlink(ri, dip); 1041 break; 1042 case S_IFDIR: 1043 xrep_dinode_zap_dir(ri, dip); 1044 break; 1045 } 1046 } 1047 1048 /* 1049 * Check the attr fork for things that will fail the ifork verifiers or the 1050 * ifork formatters. 1051 */ 1052 STATIC bool 1053 xrep_dinode_check_afork( 1054 struct xfs_scrub *sc, 1055 struct xfs_dinode *dip) 1056 { 1057 struct xfs_attr_sf_hdr *afork_ptr; 1058 size_t attr_size; 1059 unsigned int afork_size; 1060 1061 if (XFS_DFORK_BOFF(dip) == 0) 1062 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || 1063 xfs_dfork_attr_extents(dip) != 0; 1064 1065 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1066 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1067 1068 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { 1069 case XFS_DINODE_FMT_LOCAL: 1070 /* Fork has to be large enough to extract the xattr size. */ 1071 if (afork_size < sizeof(struct xfs_attr_sf_hdr)) 1072 return true; 1073 1074 /* xattr structure cannot be larger than the fork */ 1075 attr_size = be16_to_cpu(afork_ptr->totsize); 1076 if (attr_size > afork_size) 1077 return true; 1078 1079 /* xattr structure must pass verification. */ 1080 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; 1081 case XFS_DINODE_FMT_EXTENTS: 1082 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, 1083 XFS_ATTR_FORK)) 1084 return true; 1085 break; 1086 case XFS_DINODE_FMT_BTREE: 1087 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, 1088 XFS_ATTR_FORK)) 1089 return true; 1090 break; 1091 default: 1092 return true; 1093 } 1094 1095 return false; 1096 } 1097 1098 /* 1099 * Reset the attr fork to empty. Since the attr fork could have contained 1100 * ACLs, make the file readable only by root. 
*/
STATIC void
xrep_dinode_zap_afork(
	struct xrep_inode	*ri,
	struct xfs_dinode	*dip,
	uint16_t		mode)
{
	struct xfs_scrub	*sc = ri->sc;

	trace_xrep_dinode_zap_afork(sc, dip);

	/* Remember that we zapped the attr fork for later health reporting. */
	ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;

	/* Extents format with zero extents and zero blocks is the empty state. */
	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
	xrep_dinode_set_attr_nextents(dip, 0);
	ri->attr_blocks = 0;

	/*
	 * If the data fork is in btree format, removing the attr fork entirely
	 * might cause verifier failures if the next level down in the bmbt
	 * could now fit in the data fork area.
	 */
	if (dip->di_format != XFS_DINODE_FMT_BTREE)
		dip->di_forkoff = 0;

	/* Drop every rwx permission bit and hand the file to uid/gid zero. */
	dip->di_mode = cpu_to_be16(mode & ~0777);
	dip->di_uid = 0;
	dip->di_gid = 0;
}

/*
 * Make sure the fork offset is a sensible value.
 *
 * Computes the minimum number of bytes each fork needs, then adjusts
 * di_forkoff (or zaps the attr fork) so that a fork that was reset to zero
 * extents, but for which the rmap scan found extents, still has room for a
 * minimal bmap btree root.
 */
STATIC void
xrep_dinode_ensure_forkoff(
	struct xrep_inode	*ri,
	struct xfs_dinode	*dip,
	uint16_t		mode)
{
	struct xfs_bmdr_block	*bmdr;
	struct xfs_scrub	*sc = ri->sc;
	xfs_extnum_t		attr_extents, data_extents;
	size_t			bmdr_minsz = xfs_bmdr_space_calc(1);
	unsigned int		lit_sz = XFS_LITINO(sc->mp);
	unsigned int		afork_min, dfork_min;

	trace_xrep_dinode_ensure_forkoff(sc, dip);

	/*
	 * Before calling this function, xrep_dinode_core ensured that both
	 * forks actually fit inside their respective literal areas.  If this
	 * was not the case, the fork was reset to FMT_EXTENTS with zero
	 * records.  If the rmapbt scan found attr or data fork blocks, this
	 * will be noted in the dinode_stats, and we must leave enough room
	 * for the bmap repair code to reconstruct the mapping structure.
	 *
	 * First, compute the minimum space required for the attr fork.
	 */
	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform xattr structure at all, that
		 * means the attr fork area was exactly large enough to fit
		 * the sf structure.
		 */
		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		attr_extents = xfs_dfork_attr_extents(dip);
		if (attr_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the attr fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
		} else if (ri->attr_extents > 0) {
			/*
			 * The attr fork thinks it has zero extents, but we
			 * found some xattr extents.  We need to leave enough
			 * empty space here so that the incore attr fork will
			 * get created (and hence trigger the attr fork bmap
			 * repairer).
			 */
			afork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			afork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
		afork_min = xfs_bmap_broot_space(sc->mp, bmdr);
		break;
	default:
		/* We should never see any other formats. */
		afork_min = 0;
		break;
	}

	/* Compute the minimum space required for the data fork. */
	switch (dip->di_format) {
	case XFS_DINODE_FMT_DEV:
		dfork_min = sizeof(__be32);
		break;
	case XFS_DINODE_FMT_UUID:
		dfork_min = sizeof(uuid_t);
		break;
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform data fork at all, that means
		 * the data fork area was large enough to fit whatever was in
		 * there.
		 */
		dfork_min = be64_to_cpu(dip->di_size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		data_extents = xfs_dfork_data_extents(dip);
		if (data_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the data fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
			/*
			 * The data fork thinks it has zero extents, but we
			 * found some data extents.  We need to leave enough
			 * empty space here so that the data fork bmap repair
			 * will recover the mappings.
			 */
			dfork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			dfork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
		dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
		break;
	default:
		/* Unrecognized data fork format: reserve nothing. */
		dfork_min = 0;
		break;
	}

	/*
	 * Round all values up to the nearest 8 bytes, because that is the
	 * precision of di_forkoff.
	 */
	afork_min = roundup(afork_min, 8);
	dfork_min = roundup(dfork_min, 8);
	bmdr_minsz = roundup(bmdr_minsz, 8);

	ASSERT(dfork_min <= lit_sz);
	ASSERT(afork_min <= lit_sz);

	/*
	 * If the data fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork up.
	 */
	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_data_extents(dip) == 0 &&
	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
		if (bmdr_minsz + afork_min > lit_sz) {
			/*
			 * The attr fork and the stub fork we need to recover
			 * the data fork won't both fit.  Zap the attr fork.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
			/*
			 * NOTE(review): afork_min is not read again in this
			 * function after this assignment.
			 */
			afork_min = bmdr_minsz;
		} else {
			void	*before, *after;

			/*
			 * Otherwise, just slide the attr fork up.  The new
			 * offset is stored in 8-byte units (hence the shift),
			 * and the copy length is evaluated after di_forkoff
			 * has been updated, i.e. it is the size of the
			 * relocated attr fork area.
			 */
			before = XFS_DFORK_APTR(dip);
			dip->di_forkoff = bmdr_minsz >> 3;
			after = XFS_DFORK_APTR(dip);
			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
		}
	}

	/*
	 * If the attr fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork down.
	 */
	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_attr_extents(dip) == 0 &&
	    ri->attr_extents > 0 &&
	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
			/*
			 * If the data fork is in btree format then we can't
			 * adjust forkoff because that runs the risk of
			 * violating the extents/btree format transition rules.
			 */
		} else if (bmdr_minsz + dfork_min > lit_sz) {
			/*
			 * If we can't move the attr fork, too bad, we lose the
			 * attr fork and leak its blocks.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
		} else {
			/*
			 * Otherwise, just slide the attr fork down.  The attr
			 * fork is empty, so we don't have any old contents to
			 * move here.
			 */
			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
		}
	}
}

/*
 * Zap the data/attr forks if we spot anything that isn't going to pass the
 * ifork verifiers or the ifork formatters, because we need to get the inode
 * into good enough shape that the higher level repair functions can run.
1321 */ 1322 STATIC void 1323 xrep_dinode_zap_forks( 1324 struct xrep_inode *ri, 1325 struct xfs_dinode *dip) 1326 { 1327 struct xfs_scrub *sc = ri->sc; 1328 xfs_extnum_t data_extents; 1329 xfs_extnum_t attr_extents; 1330 xfs_filblks_t nblocks; 1331 uint16_t mode; 1332 bool zap_datafork = false; 1333 bool zap_attrfork = ri->zap_acls; 1334 1335 trace_xrep_dinode_zap_forks(sc, dip); 1336 1337 mode = be16_to_cpu(dip->di_mode); 1338 1339 data_extents = xfs_dfork_data_extents(dip); 1340 attr_extents = xfs_dfork_attr_extents(dip); 1341 nblocks = be64_to_cpu(dip->di_nblocks); 1342 1343 /* Inode counters don't make sense? */ 1344 if (data_extents > nblocks) 1345 zap_datafork = true; 1346 if (attr_extents > nblocks) 1347 zap_attrfork = true; 1348 if (data_extents + attr_extents > nblocks) 1349 zap_datafork = zap_attrfork = true; 1350 1351 if (!zap_datafork) 1352 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); 1353 if (!zap_attrfork) 1354 zap_attrfork = xrep_dinode_check_afork(sc, dip); 1355 1356 /* Zap whatever's bad. */ 1357 if (zap_attrfork) 1358 xrep_dinode_zap_afork(ri, dip, mode); 1359 if (zap_datafork) 1360 xrep_dinode_zap_dfork(ri, dip, mode); 1361 xrep_dinode_ensure_forkoff(ri, dip, mode); 1362 1363 /* 1364 * Zero di_nblocks if we don't have any extents at all to satisfy the 1365 * buffer verifier. 1366 */ 1367 data_extents = xfs_dfork_data_extents(dip); 1368 attr_extents = xfs_dfork_attr_extents(dip); 1369 if (data_extents + attr_extents == 0) 1370 dip->di_nblocks = 0; 1371 } 1372 1373 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ 1374 STATIC int 1375 xrep_dinode_core( 1376 struct xrep_inode *ri) 1377 { 1378 struct xfs_scrub *sc = ri->sc; 1379 struct xfs_buf *bp; 1380 struct xfs_dinode *dip; 1381 xfs_ino_t ino = sc->sm->sm_ino; 1382 int error; 1383 int iget_error; 1384 1385 /* Figure out what this inode had mapped in both forks. 
*/ 1386 error = xrep_dinode_count_rmaps(ri); 1387 if (error) 1388 return error; 1389 1390 /* Read the inode cluster buffer. */ 1391 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1392 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1393 NULL); 1394 if (error) 1395 return error; 1396 1397 /* Make sure we can pass the inode buffer verifier. */ 1398 xrep_dinode_buf(sc, bp); 1399 bp->b_ops = &xfs_inode_buf_ops; 1400 1401 /* Fix everything the verifier will complain about. */ 1402 dip = xfs_buf_offset(bp, ri->imap.im_boffset); 1403 xrep_dinode_header(sc, dip); 1404 iget_error = xrep_dinode_mode(ri, dip); 1405 if (iget_error) 1406 goto write; 1407 xrep_dinode_nlinks(dip); 1408 xrep_dinode_flags(sc, dip, ri->rt_extents > 0); 1409 xrep_dinode_size(ri, dip); 1410 xrep_dinode_extsize_hints(sc, dip); 1411 xrep_dinode_zap_forks(ri, dip); 1412 1413 write: 1414 /* Write out the inode. */ 1415 trace_xrep_dinode_fixed(sc, dip); 1416 xfs_dinode_calc_crc(sc->mp, dip); 1417 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); 1418 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, 1419 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); 1420 1421 /* 1422 * In theory, we've fixed the ondisk inode record enough that we should 1423 * be able to load the inode into the cache. Try to iget that inode 1424 * now while we hold the AGI and the inode cluster buffer and take the 1425 * IOLOCK so that we can continue with repairs without anyone else 1426 * accessing the inode. If iget fails, we still need to commit the 1427 * changes. 1428 */ 1429 if (!iget_error) 1430 iget_error = xchk_iget(sc, ino, &sc->ip); 1431 if (!iget_error) 1432 xchk_ilock(sc, XFS_IOLOCK_EXCL); 1433 1434 /* 1435 * Commit the inode cluster buffer updates and drop the AGI buffer that 1436 * we've been holding since scrub setup. From here on out, repairs 1437 * deal only with the cached inode. 
1438 */ 1439 error = xrep_trans_commit(sc); 1440 if (error) 1441 return error; 1442 1443 if (iget_error) 1444 return iget_error; 1445 1446 error = xchk_trans_alloc(sc, 0); 1447 if (error) 1448 return error; 1449 1450 error = xrep_ino_dqattach(sc); 1451 if (error) 1452 return error; 1453 1454 xchk_ilock(sc, XFS_ILOCK_EXCL); 1455 if (ri->ino_sick_mask) 1456 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); 1457 return 0; 1458 } 1459 1460 /* Fix everything xfs_dinode_verify cares about. */ 1461 STATIC int 1462 xrep_dinode_problems( 1463 struct xrep_inode *ri) 1464 { 1465 struct xfs_scrub *sc = ri->sc; 1466 int error; 1467 1468 error = xrep_dinode_core(ri); 1469 if (error) 1470 return error; 1471 1472 /* We had to fix a totally busted inode, schedule quotacheck. */ 1473 if (XFS_IS_UQUOTA_ON(sc->mp)) 1474 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1475 if (XFS_IS_GQUOTA_ON(sc->mp)) 1476 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1477 if (XFS_IS_PQUOTA_ON(sc->mp)) 1478 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1479 1480 return 0; 1481 } 1482 1483 /* 1484 * Fix problems that the verifiers don't care about. In general these are 1485 * errors that don't cause problems elsewhere in the kernel that we can easily 1486 * detect, so we don't check them all that rigorously. 1487 */ 1488 1489 /* Make sure block and extent counts are ok. */ 1490 STATIC int 1491 xrep_inode_blockcounts( 1492 struct xfs_scrub *sc) 1493 { 1494 struct xfs_ifork *ifp; 1495 xfs_filblks_t count; 1496 xfs_filblks_t acount; 1497 xfs_extnum_t nextents; 1498 int error; 1499 1500 trace_xrep_inode_blockcounts(sc); 1501 1502 /* Set data fork counters from the data fork mappings. */ 1503 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1504 &nextents, &count); 1505 if (error) 1506 return error; 1507 if (xfs_is_reflink_inode(sc->ip)) { 1508 /* 1509 * data fork blockcount can exceed physical storage if a user 1510 * reflinks the same block over and over again. 
1511 */ 1512 ; 1513 } else if (XFS_IS_REALTIME_INODE(sc->ip)) { 1514 if (count >= sc->mp->m_sb.sb_rblocks) 1515 return -EFSCORRUPTED; 1516 } else { 1517 if (count >= sc->mp->m_sb.sb_dblocks) 1518 return -EFSCORRUPTED; 1519 } 1520 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); 1521 if (error) 1522 return error; 1523 sc->ip->i_df.if_nextents = nextents; 1524 1525 /* Set attr fork counters from the attr fork mappings. */ 1526 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1527 if (ifp) { 1528 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1529 &nextents, &acount); 1530 if (error) 1531 return error; 1532 if (count >= sc->mp->m_sb.sb_dblocks) 1533 return -EFSCORRUPTED; 1534 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, 1535 nextents); 1536 if (error) 1537 return error; 1538 ifp->if_nextents = nextents; 1539 } else { 1540 acount = 0; 1541 } 1542 1543 sc->ip->i_nblocks = count + acount; 1544 return 0; 1545 } 1546 1547 /* Check for invalid uid/gid/prid. */ 1548 STATIC void 1549 xrep_inode_ids( 1550 struct xfs_scrub *sc) 1551 { 1552 bool dirty = false; 1553 1554 trace_xrep_inode_ids(sc); 1555 1556 if (!uid_valid(VFS_I(sc->ip)->i_uid)) { 1557 i_uid_write(VFS_I(sc->ip), 0); 1558 dirty = true; 1559 if (XFS_IS_UQUOTA_ON(sc->mp)) 1560 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1561 } 1562 1563 if (!gid_valid(VFS_I(sc->ip)->i_gid)) { 1564 i_gid_write(VFS_I(sc->ip), 0); 1565 dirty = true; 1566 if (XFS_IS_GQUOTA_ON(sc->mp)) 1567 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1568 } 1569 1570 if (sc->ip->i_projid == -1U) { 1571 sc->ip->i_projid = 0; 1572 dirty = true; 1573 if (XFS_IS_PQUOTA_ON(sc->mp)) 1574 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1575 } 1576 1577 /* strip setuid/setgid if we touched any of the ids */ 1578 if (dirty) 1579 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); 1580 } 1581 1582 static inline void 1583 xrep_clamp_timestamp( 1584 struct xfs_inode *ip, 1585 struct timespec64 *ts) 1586 { 1587 ts->tv_nsec = clamp_t(long, 
ts->tv_nsec, 0, NSEC_PER_SEC); 1588 *ts = timestamp_truncate(*ts, VFS_I(ip)); 1589 } 1590 1591 /* Nanosecond counters can't have more than 1 billion. */ 1592 STATIC void 1593 xrep_inode_timestamps( 1594 struct xfs_inode *ip) 1595 { 1596 struct timespec64 tstamp; 1597 struct inode *inode = VFS_I(ip); 1598 1599 tstamp = inode_get_atime(inode); 1600 xrep_clamp_timestamp(ip, &tstamp); 1601 inode_set_atime_to_ts(inode, tstamp); 1602 1603 tstamp = inode_get_mtime(inode); 1604 xrep_clamp_timestamp(ip, &tstamp); 1605 inode_set_mtime_to_ts(inode, tstamp); 1606 1607 tstamp = inode_get_ctime(inode); 1608 xrep_clamp_timestamp(ip, &tstamp); 1609 inode_set_ctime_to_ts(inode, tstamp); 1610 1611 xrep_clamp_timestamp(ip, &ip->i_crtime); 1612 } 1613 1614 /* Fix inode flags that don't make sense together. */ 1615 STATIC void 1616 xrep_inode_flags( 1617 struct xfs_scrub *sc) 1618 { 1619 uint16_t mode; 1620 1621 trace_xrep_inode_flags(sc); 1622 1623 mode = VFS_I(sc->ip)->i_mode; 1624 1625 /* Clear junk flags */ 1626 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) 1627 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; 1628 1629 /* NEWRTBM only applies to realtime bitmaps */ 1630 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) 1631 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; 1632 else 1633 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; 1634 1635 /* These only make sense for directories. */ 1636 if (!S_ISDIR(mode)) 1637 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | 1638 XFS_DIFLAG_EXTSZINHERIT | 1639 XFS_DIFLAG_PROJINHERIT | 1640 XFS_DIFLAG_NOSYMLINKS); 1641 1642 /* These only make sense for files. */ 1643 if (!S_ISREG(mode)) 1644 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | 1645 XFS_DIFLAG_EXTSIZE); 1646 1647 /* These only make sense for non-rt files. */ 1648 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1649 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; 1650 1651 /* Immutable and append only? Drop the append. 
*/ 1652 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && 1653 (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) 1654 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; 1655 1656 /* Clear junk flags. */ 1657 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) 1658 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; 1659 1660 /* No reflink flag unless we support it and it's a file. */ 1661 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) 1662 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1663 1664 /* DAX only applies to files and dirs. */ 1665 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1666 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1667 1668 /* No reflink files on the realtime device. */ 1669 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1670 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1671 } 1672 1673 /* 1674 * Fix size problems with block/node format directories. If we fail to find 1675 * the extent list, just bail out and let the bmapbtd repair functions clean 1676 * up that mess. 1677 */ 1678 STATIC void 1679 xrep_inode_blockdir_size( 1680 struct xfs_scrub *sc) 1681 { 1682 struct xfs_iext_cursor icur; 1683 struct xfs_bmbt_irec got; 1684 struct xfs_ifork *ifp; 1685 xfs_fileoff_t off; 1686 int error; 1687 1688 trace_xrep_inode_blockdir_size(sc); 1689 1690 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); 1691 if (error) 1692 return; 1693 1694 /* Find the last block before 32G; this is the dir size. */ 1695 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1696 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); 1697 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { 1698 /* zero-extents directory? */ 1699 return; 1700 } 1701 1702 off = got.br_startoff + got.br_blockcount; 1703 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, 1704 XFS_FSB_TO_B(sc->mp, off)); 1705 } 1706 1707 /* Fix size problems with short format directories. 
*/ 1708 STATIC void 1709 xrep_inode_sfdir_size( 1710 struct xfs_scrub *sc) 1711 { 1712 struct xfs_ifork *ifp; 1713 1714 trace_xrep_inode_sfdir_size(sc); 1715 1716 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1717 sc->ip->i_disk_size = ifp->if_bytes; 1718 } 1719 1720 /* 1721 * Fix any irregularities in a directory inode's size now that we can iterate 1722 * extent maps and access other regular inode data. 1723 */ 1724 STATIC void 1725 xrep_inode_dir_size( 1726 struct xfs_scrub *sc) 1727 { 1728 trace_xrep_inode_dir_size(sc); 1729 1730 switch (sc->ip->i_df.if_format) { 1731 case XFS_DINODE_FMT_EXTENTS: 1732 case XFS_DINODE_FMT_BTREE: 1733 xrep_inode_blockdir_size(sc); 1734 break; 1735 case XFS_DINODE_FMT_LOCAL: 1736 xrep_inode_sfdir_size(sc); 1737 break; 1738 } 1739 } 1740 1741 /* Fix extent size hint problems. */ 1742 STATIC void 1743 xrep_inode_extsize( 1744 struct xfs_scrub *sc) 1745 { 1746 /* Fix misaligned extent size hints on a directory. */ 1747 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1748 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && 1749 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { 1750 sc->ip->i_extsize = 0; 1751 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; 1752 } 1753 } 1754 1755 /* Ensure this file has an attr fork if it needs to hold a parent pointer. */ 1756 STATIC int 1757 xrep_inode_pptr( 1758 struct xfs_scrub *sc) 1759 { 1760 struct xfs_mount *mp = sc->mp; 1761 struct xfs_inode *ip = sc->ip; 1762 struct inode *inode = VFS_I(ip); 1763 1764 if (!xfs_has_parent(mp)) 1765 return 0; 1766 1767 /* 1768 * Unlinked inodes that cannot be added to the directory tree will not 1769 * have a parent pointer. 1770 */ 1771 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) 1772 return 0; 1773 1774 /* Children of the superblock do not have parent pointers. */ 1775 if (xchk_inode_is_sb_rooted(ip)) 1776 return 0; 1777 1778 /* Inode already has an attr fork; no further work possible here. 
*/ 1779 if (xfs_inode_has_attr_fork(ip)) 1780 return 0; 1781 1782 return xfs_bmap_add_attrfork(sc->tp, ip, 1783 sizeof(struct xfs_attr_sf_hdr), true); 1784 } 1785 1786 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1787 STATIC int 1788 xrep_inode_problems( 1789 struct xfs_scrub *sc) 1790 { 1791 int error; 1792 1793 error = xrep_inode_blockcounts(sc); 1794 if (error) 1795 return error; 1796 error = xrep_inode_pptr(sc); 1797 if (error) 1798 return error; 1799 xrep_inode_timestamps(sc->ip); 1800 xrep_inode_flags(sc); 1801 xrep_inode_ids(sc); 1802 /* 1803 * We can now do a better job fixing the size of a directory now that 1804 * we can scan the data fork extents than we could in xrep_dinode_size. 1805 */ 1806 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1807 xrep_inode_dir_size(sc); 1808 xrep_inode_extsize(sc); 1809 1810 trace_xrep_inode_fixed(sc); 1811 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1812 return xrep_roll_trans(sc); 1813 } 1814 1815 /* 1816 * Make sure this inode's unlinked list pointers are consistent with its 1817 * link count. 1818 */ 1819 STATIC int 1820 xrep_inode_unlinked( 1821 struct xfs_scrub *sc) 1822 { 1823 unsigned int nlink = VFS_I(sc->ip)->i_nlink; 1824 int error; 1825 1826 /* 1827 * If this inode is linked from the directory tree and on the unlinked 1828 * list, remove it from the unlinked list. 1829 */ 1830 if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { 1831 struct xfs_perag *pag; 1832 int error; 1833 1834 pag = xfs_perag_get(sc->mp, 1835 XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); 1836 error = xfs_iunlink_remove(sc->tp, pag, sc->ip); 1837 xfs_perag_put(pag); 1838 if (error) 1839 return error; 1840 } 1841 1842 /* 1843 * If this inode is not linked from the directory tree yet not on the 1844 * unlinked list, put it on the unlinked list. 
1845 */ 1846 if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { 1847 error = xfs_iunlink(sc->tp, sc->ip); 1848 if (error) 1849 return error; 1850 } 1851 1852 return 0; 1853 } 1854 1855 /* Repair an inode's fields. */ 1856 int 1857 xrep_inode( 1858 struct xfs_scrub *sc) 1859 { 1860 int error = 0; 1861 1862 /* 1863 * No inode? That means we failed the _iget verifiers. Repair all 1864 * the things that the inode verifiers care about, then retry _iget. 1865 */ 1866 if (!sc->ip) { 1867 struct xrep_inode *ri = sc->buf; 1868 1869 ASSERT(ri != NULL); 1870 1871 error = xrep_dinode_problems(ri); 1872 if (error == -EBUSY) { 1873 /* 1874 * Directory scan to recover inode mode encountered a 1875 * busy inode, so we did not continue repairing things. 1876 */ 1877 return 0; 1878 } 1879 if (error) 1880 return error; 1881 1882 /* By this point we had better have a working incore inode. */ 1883 if (!sc->ip) 1884 return -EFSCORRUPTED; 1885 } 1886 1887 xfs_trans_ijoin(sc->tp, sc->ip, 0); 1888 1889 /* If we found corruption of any kind, try to fix it. */ 1890 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || 1891 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { 1892 error = xrep_inode_problems(sc); 1893 if (error) 1894 return error; 1895 } 1896 1897 /* See if we can clear the reflink flag. */ 1898 if (xfs_is_reflink_inode(sc->ip)) { 1899 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); 1900 if (error) 1901 return error; 1902 } 1903 1904 /* Reconnect incore unlinked list */ 1905 error = xrep_inode_unlinked(sc); 1906 if (error) 1907 return error; 1908 1909 return xrep_defer_finish(sc); 1910 } 1911