1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_icache.h" 20 #include "xfs_inode_buf.h" 21 #include "xfs_inode_fork.h" 22 #include "xfs_ialloc.h" 23 #include "xfs_da_format.h" 24 #include "xfs_reflink.h" 25 #include "xfs_alloc.h" 26 #include "xfs_rmap.h" 27 #include "xfs_rmap_btree.h" 28 #include "xfs_bmap.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_util.h" 31 #include "xfs_dir2.h" 32 #include "xfs_dir2_priv.h" 33 #include "xfs_quota_defs.h" 34 #include "xfs_quota.h" 35 #include "xfs_ag.h" 36 #include "xfs_rtbitmap.h" 37 #include "xfs_attr_leaf.h" 38 #include "xfs_log_priv.h" 39 #include "xfs_health.h" 40 #include "xfs_symlink_remote.h" 41 #include "scrub/xfs_scrub.h" 42 #include "scrub/scrub.h" 43 #include "scrub/common.h" 44 #include "scrub/btree.h" 45 #include "scrub/trace.h" 46 #include "scrub/repair.h" 47 #include "scrub/iscan.h" 48 #include "scrub/readdir.h" 49 #include "scrub/tempfile.h" 50 51 /* 52 * Inode Record Repair 53 * =================== 54 * 55 * Roughly speaking, inode problems can be classified based on whether or not 56 * they trip the dinode verifiers. If those trip, then we won't be able to 57 * xfs_iget ourselves the inode. 58 * 59 * Therefore, the xrep_dinode_* functions fix anything that will cause the 60 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions 61 * fix things on live incore inodes. The inode repair functions make decisions 62 * with security and usability implications when reviving a file: 63 * 64 * - Files with zero di_mode or a garbage di_mode are converted to regular file 65 * that only root can read. This file may not actually contain user data, 66 * if the file was not previously a regular file. Setuid and setgid bits 67 * are cleared. 68 * 69 * - Zero-size directories can be truncated to look empty. It is necessary to 70 * run the bmapbtd and directory repair functions to fully rebuild the 71 * directory. 72 * 73 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary 74 * to run the bmapbtd and symlink repair functions to salvage the symlink. 75 * 76 * - Invalid extent size hints will be removed. 77 * 78 * - Quotacheck will be scheduled if we repaired an inode that was so badly 79 * damaged that the ondisk inode had to be rebuilt. 80 * 81 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. 82 * Setuid and setgid bits are cleared. 83 * 84 * - Data and attr forks are reset to extents format with zero extents if the 85 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta 86 * repair functions to recover the space mapping. 87 * 88 * - ACLs will not be recovered if the attr fork is zapped or the extended 89 * attribute structure itself requires salvaging. 90 * 91 * - If the attr fork is zapped, the user and group ids are reset to root and 92 * the setuid and setgid bits are removed. 93 */ 94 95 /* 96 * All the information we need to repair the ondisk inode if we can't iget the 97 * incore inode. We don't allocate this buffer unless we're going to perform 98 * a repair to the ondisk inode cluster buffer. 99 */ 100 struct xrep_inode { 101 /* Inode mapping that we saved from the initial lookup attempt. */ 102 struct xfs_imap imap; 103 104 struct xfs_scrub *sc; 105 106 /* Blocks in use on the data device by data extents or bmbt blocks. */ 107 xfs_rfsblock_t data_blocks; 108 109 /* Blocks in use on the rt device. */ 110 xfs_rfsblock_t rt_blocks; 111 112 /* Blocks in use by the attr fork. */ 113 xfs_rfsblock_t attr_blocks; 114 115 /* Number of data device extents for the data fork. */ 116 xfs_extnum_t data_extents; 117 118 /* 119 * Number of realtime device extents for the data fork. If 120 * data_extents and rt_extents indicate that the data fork has extents 121 * on both devices, we'll just back away slowly. 122 */ 123 xfs_extnum_t rt_extents; 124 125 /* Number of (data device) extents for the attr fork. */ 126 xfs_aextnum_t attr_extents; 127 128 /* Sick state to set after zapping parts of the inode. */ 129 unsigned int ino_sick_mask; 130 131 /* Must we remove all access from this file? */ 132 bool zap_acls; 133 134 /* Inode scanner to see if we can find the ftype from dirents */ 135 struct xchk_iscan ftype_iscan; 136 uint8_t alleged_ftype; 137 }; 138 139 /* 140 * Setup function for inode repair. @imap contains the ondisk inode mapping 141 * information so that we can correct the ondisk inode cluster buffer if 142 * necessary to make iget work. 143 */ 144 int 145 xrep_setup_inode( 146 struct xfs_scrub *sc, 147 const struct xfs_imap *imap) 148 { 149 struct xrep_inode *ri; 150 151 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); 152 if (!sc->buf) 153 return -ENOMEM; 154 155 ri = sc->buf; 156 memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); 157 ri->sc = sc; 158 return 0; 159 } 160 161 /* 162 * Make sure this ondisk inode can pass the inode buffer verifier. This is 163 * not the same as the dinode verifier. 164 */ 165 STATIC void 166 xrep_dinode_buf_core( 167 struct xfs_scrub *sc, 168 struct xfs_buf *bp, 169 unsigned int ioffset) 170 { 171 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); 172 struct xfs_trans *tp = sc->tp; 173 struct xfs_mount *mp = sc->mp; 174 xfs_agino_t agino; 175 bool crc_ok = false; 176 bool magic_ok = false; 177 bool unlinked_ok = false; 178 179 agino = be32_to_cpu(dip->di_next_unlinked); 180 181 if (xfs_verify_agino_or_null(bp->b_pag, agino)) 182 unlinked_ok = true; 183 184 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 185 xfs_dinode_good_version(mp, dip->di_version)) 186 magic_ok = true; 187 188 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 189 XFS_DINODE_CRC_OFF)) 190 crc_ok = true; 191 192 if (magic_ok && unlinked_ok && crc_ok) 193 return; 194 195 if (!magic_ok) { 196 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 197 dip->di_version = 3; 198 } 199 if (!unlinked_ok) 200 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 201 xfs_dinode_calc_crc(mp, dip); 202 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 203 xfs_trans_log_buf(tp, bp, ioffset, 204 ioffset + sizeof(struct xfs_dinode) - 1); 205 } 206 207 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ 208 STATIC void 209 xrep_dinode_buf( 210 struct xfs_scrub *sc, 211 struct xfs_buf *bp) 212 { 213 struct xfs_mount *mp = sc->mp; 214 int i; 215 int ni; 216 217 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 218 for (i = 0; i < ni; i++) 219 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); 220 } 221 222 /* Reinitialize things that never change in an inode. */ 223 STATIC void 224 xrep_dinode_header( 225 struct xfs_scrub *sc, 226 struct xfs_dinode *dip) 227 { 228 trace_xrep_dinode_header(sc, dip); 229 230 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 231 if (!xfs_dinode_good_version(sc->mp, dip->di_version)) 232 dip->di_version = 3; 233 dip->di_ino = cpu_to_be64(sc->sm->sm_ino); 234 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); 235 dip->di_gen = cpu_to_be32(sc->sm->sm_gen); 236 } 237 238 /* 239 * If this directory entry points to the scrub target inode, then the directory 240 * we're scanning is the parent of the scrub target inode. 241 */ 242 STATIC int 243 xrep_dinode_findmode_dirent( 244 struct xfs_scrub *sc, 245 struct xfs_inode *dp, 246 xfs_dir2_dataptr_t dapos, 247 const struct xfs_name *name, 248 xfs_ino_t ino, 249 void *priv) 250 { 251 struct xrep_inode *ri = priv; 252 int error = 0; 253 254 if (xchk_should_terminate(ri->sc, &error)) 255 return error; 256 257 if (ino != sc->sm->sm_ino) 258 return 0; 259 260 /* Ignore garbage directory entry names. */ 261 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) 262 return -EFSCORRUPTED; 263 264 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 265 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 266 xfs_dir2_samename(name, &xfs_name_dot)) 267 return 0; 268 269 /* 270 * Uhoh, more than one parent for this inode and they don't agree on 271 * the file type? 272 */ 273 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && 274 ri->alleged_ftype != name->type) { 275 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, 276 ri->alleged_ftype); 277 return -EFSCORRUPTED; 278 } 279 280 /* We found a potential parent; remember the ftype. */ 281 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); 282 ri->alleged_ftype = name->type; 283 return 0; 284 } 285 286 /* Try to lock a directory, or wait a jiffy. */ 287 static inline int 288 xrep_dinode_ilock_nowait( 289 struct xfs_inode *dp, 290 unsigned int lock_mode) 291 { 292 if (xfs_ilock_nowait(dp, lock_mode)) 293 return true; 294 295 schedule_timeout_killable(1); 296 return false; 297 } 298 299 /* 300 * Try to lock a directory to look for ftype hints. Since we already hold the 301 * AGI buffer, we cannot block waiting for the ILOCK because rename can take 302 * the ILOCK and then try to lock AGIs. 303 */ 304 STATIC int 305 xrep_dinode_trylock_directory( 306 struct xrep_inode *ri, 307 struct xfs_inode *dp, 308 unsigned int *lock_modep) 309 { 310 unsigned long deadline = jiffies + msecs_to_jiffies(30000); 311 unsigned int lock_mode; 312 int error = 0; 313 314 do { 315 if (xchk_should_terminate(ri->sc, &error)) 316 return error; 317 318 if (xfs_need_iread_extents(&dp->i_df)) 319 lock_mode = XFS_ILOCK_EXCL; 320 else 321 lock_mode = XFS_ILOCK_SHARED; 322 323 if (xrep_dinode_ilock_nowait(dp, lock_mode)) { 324 *lock_modep = lock_mode; 325 return 0; 326 } 327 } while (!time_is_before_jiffies(deadline)); 328 return -EBUSY; 329 } 330 331 /* 332 * If this is a directory, walk the dirents looking for any that point to the 333 * scrub target inode. 334 */ 335 STATIC int 336 xrep_dinode_findmode_walk_directory( 337 struct xrep_inode *ri, 338 struct xfs_inode *dp) 339 { 340 struct xfs_scrub *sc = ri->sc; 341 unsigned int lock_mode; 342 int error = 0; 343 344 /* Ignore temporary repair directories. */ 345 if (xrep_is_tempfile(dp)) 346 return 0; 347 348 /* 349 * Scan the directory to see if there it contains an entry pointing to 350 * the directory that we are repairing. 351 */ 352 error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); 353 if (error) 354 return error; 355 356 /* 357 * If this directory is known to be sick, we cannot scan it reliably 358 * and must abort. 359 */ 360 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | 361 XFS_SICK_INO_BMBTD | 362 XFS_SICK_INO_DIR)) { 363 error = -EFSCORRUPTED; 364 goto out_unlock; 365 } 366 367 /* 368 * We cannot complete our parent pointer scan if a directory looks as 369 * though it has been zapped by the inode record repair code. 370 */ 371 if (xchk_dir_looks_zapped(dp)) { 372 error = -EBUSY; 373 goto out_unlock; 374 } 375 376 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); 377 if (error) 378 goto out_unlock; 379 380 out_unlock: 381 xfs_iunlock(dp, lock_mode); 382 return error; 383 } 384 385 /* 386 * Try to find the mode of the inode being repaired by looking for directories 387 * that point down to this file. 388 */ 389 STATIC int 390 xrep_dinode_find_mode( 391 struct xrep_inode *ri, 392 uint16_t *mode) 393 { 394 struct xfs_scrub *sc = ri->sc; 395 struct xfs_inode *dp; 396 int error; 397 398 /* No ftype means we have no other metadata to consult. */ 399 if (!xfs_has_ftype(sc->mp)) { 400 *mode = S_IFREG; 401 return 0; 402 } 403 404 /* 405 * Scan all directories for parents that might point down to this 406 * inode. Skip the inode being repaired during the scan since it 407 * cannot be its own parent. Note that we still hold the AGI locked 408 * so there's a real possibility that _iscan_iter can return EBUSY. 409 */ 410 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); 411 xchk_iscan_set_agi_trylock(&ri->ftype_iscan); 412 ri->ftype_iscan.skip_ino = sc->sm->sm_ino; 413 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; 414 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { 415 if (S_ISDIR(VFS_I(dp)->i_mode)) 416 error = xrep_dinode_findmode_walk_directory(ri, dp); 417 xchk_iscan_mark_visited(&ri->ftype_iscan, dp); 418 xchk_irele(sc, dp); 419 if (error < 0) 420 break; 421 if (xchk_should_terminate(sc, &error)) 422 break; 423 } 424 xchk_iscan_iter_finish(&ri->ftype_iscan); 425 xchk_iscan_teardown(&ri->ftype_iscan); 426 427 if (error == -EBUSY) { 428 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { 429 /* 430 * If we got an EBUSY after finding at least one 431 * dirent, that means the scan found an inode on the 432 * inactivation list and could not open it. Accept the 433 * alleged ftype and install a new mode below. 434 */ 435 error = 0; 436 } else if (!(sc->flags & XCHK_TRY_HARDER)) { 437 /* 438 * Otherwise, retry the operation one time to see if 439 * the reason for the delay is an inode from the same 440 * cluster buffer waiting on the inactivation list. 441 */ 442 error = -EDEADLOCK; 443 } 444 } 445 if (error) 446 return error; 447 448 /* 449 * Convert the discovered ftype into the file mode. If all else fails, 450 * return S_IFREG. 451 */ 452 switch (ri->alleged_ftype) { 453 case XFS_DIR3_FT_DIR: 454 *mode = S_IFDIR; 455 break; 456 case XFS_DIR3_FT_WHT: 457 case XFS_DIR3_FT_CHRDEV: 458 *mode = S_IFCHR; 459 break; 460 case XFS_DIR3_FT_BLKDEV: 461 *mode = S_IFBLK; 462 break; 463 case XFS_DIR3_FT_FIFO: 464 *mode = S_IFIFO; 465 break; 466 case XFS_DIR3_FT_SOCK: 467 *mode = S_IFSOCK; 468 break; 469 case XFS_DIR3_FT_SYMLINK: 470 *mode = S_IFLNK; 471 break; 472 default: 473 *mode = S_IFREG; 474 break; 475 } 476 return 0; 477 } 478 479 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ 480 STATIC int 481 xrep_dinode_mode( 482 struct xrep_inode *ri, 483 struct xfs_dinode *dip) 484 { 485 struct xfs_scrub *sc = ri->sc; 486 uint16_t mode = be16_to_cpu(dip->di_mode); 487 int error; 488 489 trace_xrep_dinode_mode(sc, dip); 490 491 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) 492 return 0; 493 494 /* Try to fix the mode. If we cannot, then leave everything alone. */ 495 error = xrep_dinode_find_mode(ri, &mode); 496 switch (error) { 497 case -EINTR: 498 case -EBUSY: 499 case -EDEADLOCK: 500 /* temporary failure or fatal signal */ 501 return error; 502 case 0: 503 /* found mode */ 504 break; 505 default: 506 /* some other error, assume S_IFREG */ 507 mode = S_IFREG; 508 break; 509 } 510 511 /* bad mode, so we set it to a file that only root can read */ 512 dip->di_mode = cpu_to_be16(mode); 513 dip->di_uid = 0; 514 dip->di_gid = 0; 515 ri->zap_acls = true; 516 return 0; 517 } 518 519 /* Fix unused link count fields having nonzero values. */ 520 STATIC void 521 xrep_dinode_nlinks( 522 struct xfs_dinode *dip) 523 { 524 if (dip->di_version > 1) 525 dip->di_onlink = 0; 526 else 527 dip->di_nlink = 0; 528 } 529 530 /* Fix any conflicting flags that the verifiers complain about. */ 531 STATIC void 532 xrep_dinode_flags( 533 struct xfs_scrub *sc, 534 struct xfs_dinode *dip, 535 bool isrt) 536 { 537 struct xfs_mount *mp = sc->mp; 538 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 539 uint16_t flags = be16_to_cpu(dip->di_flags); 540 uint16_t mode = be16_to_cpu(dip->di_mode); 541 542 trace_xrep_dinode_flags(sc, dip); 543 544 if (isrt) 545 flags |= XFS_DIFLAG_REALTIME; 546 else 547 flags &= ~XFS_DIFLAG_REALTIME; 548 549 /* 550 * For regular files on a reflink filesystem, set the REFLINK flag to 551 * protect shared extents. A later stage will actually check those 552 * extents and clear the flag if possible. 553 */ 554 if (xfs_has_reflink(mp) && S_ISREG(mode)) 555 flags2 |= XFS_DIFLAG2_REFLINK; 556 else 557 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 558 if (flags & XFS_DIFLAG_REALTIME) 559 flags2 &= ~XFS_DIFLAG2_REFLINK; 560 if (!xfs_has_bigtime(mp)) 561 flags2 &= ~XFS_DIFLAG2_BIGTIME; 562 if (!xfs_has_large_extent_counts(mp)) 563 flags2 &= ~XFS_DIFLAG2_NREXT64; 564 if (flags2 & XFS_DIFLAG2_NREXT64) 565 dip->di_nrext64_pad = 0; 566 else if (dip->di_version >= 3) 567 dip->di_v3_pad = 0; 568 dip->di_flags = cpu_to_be16(flags); 569 dip->di_flags2 = cpu_to_be64(flags2); 570 } 571 572 /* 573 * Blow out symlink; now it points nowhere. We don't have to worry about 574 * incore state because this inode is failing the verifiers. 575 */ 576 STATIC void 577 xrep_dinode_zap_symlink( 578 struct xrep_inode *ri, 579 struct xfs_dinode *dip) 580 { 581 struct xfs_scrub *sc = ri->sc; 582 char *p; 583 584 trace_xrep_dinode_zap_symlink(sc, dip); 585 586 dip->di_format = XFS_DINODE_FMT_LOCAL; 587 dip->di_size = cpu_to_be64(1); 588 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 589 *p = '?'; 590 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; 591 } 592 593 /* 594 * Blow out dir, make the parent point to the root. In the future repair will 595 * reconstruct this directory for us. Note that there's no in-core directory 596 * inode because the sf verifier tripped, so we don't have to worry about the 597 * dentry cache. 598 */ 599 STATIC void 600 xrep_dinode_zap_dir( 601 struct xrep_inode *ri, 602 struct xfs_dinode *dip) 603 { 604 struct xfs_scrub *sc = ri->sc; 605 struct xfs_mount *mp = sc->mp; 606 struct xfs_dir2_sf_hdr *sfp; 607 int i8count; 608 609 trace_xrep_dinode_zap_dir(sc, dip); 610 611 dip->di_format = XFS_DINODE_FMT_LOCAL; 612 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; 613 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 614 sfp->count = 0; 615 sfp->i8count = i8count; 616 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); 617 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); 618 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; 619 } 620 621 /* Make sure we don't have a garbage file size. */ 622 STATIC void 623 xrep_dinode_size( 624 struct xrep_inode *ri, 625 struct xfs_dinode *dip) 626 { 627 struct xfs_scrub *sc = ri->sc; 628 uint64_t size = be64_to_cpu(dip->di_size); 629 uint16_t mode = be16_to_cpu(dip->di_mode); 630 631 trace_xrep_dinode_size(sc, dip); 632 633 switch (mode & S_IFMT) { 634 case S_IFIFO: 635 case S_IFCHR: 636 case S_IFBLK: 637 case S_IFSOCK: 638 /* di_size can't be nonzero for special files */ 639 dip->di_size = 0; 640 break; 641 case S_IFREG: 642 /* Regular files can't be larger than 2^63-1 bytes. */ 643 dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); 644 break; 645 case S_IFLNK: 646 /* 647 * Truncate ridiculously oversized symlinks. If the size is 648 * zero, reset it to point to the current directory. Both of 649 * these conditions trigger dinode verifier errors, so there 650 * is no in-core state to reset. 651 */ 652 if (size > XFS_SYMLINK_MAXLEN) 653 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); 654 else if (size == 0) 655 xrep_dinode_zap_symlink(ri, dip); 656 break; 657 case S_IFDIR: 658 /* 659 * Directories can't have a size larger than 32G. If the size 660 * is zero, reset it to an empty directory. Both of these 661 * conditions trigger dinode verifier errors, so there is no 662 * in-core state to reset. 663 */ 664 if (size > XFS_DIR2_SPACE_SIZE) 665 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); 666 else if (size == 0) 667 xrep_dinode_zap_dir(ri, dip); 668 break; 669 } 670 } 671 672 /* Fix extent size hints. */ 673 STATIC void 674 xrep_dinode_extsize_hints( 675 struct xfs_scrub *sc, 676 struct xfs_dinode *dip) 677 { 678 struct xfs_mount *mp = sc->mp; 679 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 680 uint16_t flags = be16_to_cpu(dip->di_flags); 681 uint16_t mode = be16_to_cpu(dip->di_mode); 682 683 xfs_failaddr_t fa; 684 685 trace_xrep_dinode_extsize_hints(sc, dip); 686 687 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 688 mode, flags); 689 if (fa) { 690 dip->di_extsize = 0; 691 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | 692 XFS_DIFLAG_EXTSZINHERIT); 693 } 694 695 if (dip->di_version < 3) 696 return; 697 698 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 699 mode, flags, flags2); 700 if (fa) { 701 dip->di_cowextsize = 0; 702 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); 703 } 704 } 705 706 /* Count extents and blocks for an inode given an rmap. */ 707 STATIC int 708 xrep_dinode_walk_rmap( 709 struct xfs_btree_cur *cur, 710 const struct xfs_rmap_irec *rec, 711 void *priv) 712 { 713 struct xrep_inode *ri = priv; 714 int error = 0; 715 716 if (xchk_should_terminate(ri->sc, &error)) 717 return error; 718 719 /* We only care about this inode. */ 720 if (rec->rm_owner != ri->sc->sm->sm_ino) 721 return 0; 722 723 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { 724 ri->attr_blocks += rec->rm_blockcount; 725 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 726 ri->attr_extents++; 727 728 return 0; 729 } 730 731 ri->data_blocks += rec->rm_blockcount; 732 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 733 ri->data_extents++; 734 735 return 0; 736 } 737 738 /* Count extents and blocks for an inode from all AG rmap data. */ 739 STATIC int 740 xrep_dinode_count_ag_rmaps( 741 struct xrep_inode *ri, 742 struct xfs_perag *pag) 743 { 744 struct xfs_btree_cur *cur; 745 struct xfs_buf *agf; 746 int error; 747 748 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); 749 if (error) 750 return error; 751 752 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); 753 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); 754 xfs_btree_del_cursor(cur, error); 755 xfs_trans_brelse(ri->sc->tp, agf); 756 return error; 757 } 758 759 /* Count extents and blocks for a given inode from all rmap data. */ 760 STATIC int 761 xrep_dinode_count_rmaps( 762 struct xrep_inode *ri) 763 { 764 struct xfs_perag *pag; 765 xfs_agnumber_t agno; 766 int error; 767 768 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 769 return -EOPNOTSUPP; 770 771 for_each_perag(ri->sc->mp, agno, pag) { 772 error = xrep_dinode_count_ag_rmaps(ri, pag); 773 if (error) { 774 xfs_perag_rele(pag); 775 return error; 776 } 777 } 778 779 /* Can't have extents on both the rt and the data device. */ 780 if (ri->data_extents && ri->rt_extents) 781 return -EFSCORRUPTED; 782 783 trace_xrep_dinode_count_rmaps(ri->sc, 784 ri->data_blocks, ri->rt_blocks, ri->attr_blocks, 785 ri->data_extents, ri->rt_extents, ri->attr_extents); 786 return 0; 787 } 788 789 /* Return true if this extents-format ifork looks like garbage. */ 790 STATIC bool 791 xrep_dinode_bad_extents_fork( 792 struct xfs_scrub *sc, 793 struct xfs_dinode *dip, 794 unsigned int dfork_size, 795 int whichfork) 796 { 797 struct xfs_bmbt_irec new; 798 struct xfs_bmbt_rec *dp; 799 xfs_extnum_t nex; 800 bool isrt; 801 unsigned int i; 802 803 nex = xfs_dfork_nextents(dip, whichfork); 804 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) 805 return true; 806 807 dp = XFS_DFORK_PTR(dip, whichfork); 808 809 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); 810 for (i = 0; i < nex; i++, dp++) { 811 xfs_failaddr_t fa; 812 813 xfs_bmbt_disk_get_all(dp, &new); 814 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, 815 &new); 816 if (fa) 817 return true; 818 } 819 820 return false; 821 } 822 823 /* Return true if this btree-format ifork looks like garbage. */ 824 STATIC bool 825 xrep_dinode_bad_bmbt_fork( 826 struct xfs_scrub *sc, 827 struct xfs_dinode *dip, 828 unsigned int dfork_size, 829 int whichfork) 830 { 831 struct xfs_bmdr_block *dfp; 832 xfs_extnum_t nex; 833 unsigned int i; 834 unsigned int dmxr; 835 unsigned int nrecs; 836 unsigned int level; 837 838 nex = xfs_dfork_nextents(dip, whichfork); 839 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) 840 return true; 841 842 if (dfork_size < sizeof(struct xfs_bmdr_block)) 843 return true; 844 845 dfp = XFS_DFORK_PTR(dip, whichfork); 846 nrecs = be16_to_cpu(dfp->bb_numrecs); 847 level = be16_to_cpu(dfp->bb_level); 848 849 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) 850 return true; 851 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) 852 return true; 853 854 dmxr = xfs_bmdr_maxrecs(dfork_size, 0); 855 for (i = 1; i <= nrecs; i++) { 856 struct xfs_bmbt_key *fkp; 857 xfs_bmbt_ptr_t *fpp; 858 xfs_fileoff_t fileoff; 859 xfs_fsblock_t fsbno; 860 861 fkp = XFS_BMDR_KEY_ADDR(dfp, i); 862 fileoff = be64_to_cpu(fkp->br_startoff); 863 if (!xfs_verify_fileoff(sc->mp, fileoff)) 864 return true; 865 866 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); 867 fsbno = be64_to_cpu(*fpp); 868 if (!xfs_verify_fsbno(sc->mp, fsbno)) 869 return true; 870 } 871 872 return false; 873 } 874 875 /* 876 * Check the data fork for things that will fail the ifork verifiers or the 877 * ifork formatters. 878 */ 879 STATIC bool 880 xrep_dinode_check_dfork( 881 struct xfs_scrub *sc, 882 struct xfs_dinode *dip, 883 uint16_t mode) 884 { 885 void *dfork_ptr; 886 int64_t data_size; 887 unsigned int fmt; 888 unsigned int dfork_size; 889 890 /* 891 * Verifier functions take signed int64_t, so check for bogus negative 892 * values first. 893 */ 894 data_size = be64_to_cpu(dip->di_size); 895 if (data_size < 0) 896 return true; 897 898 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); 899 switch (mode & S_IFMT) { 900 case S_IFIFO: 901 case S_IFCHR: 902 case S_IFBLK: 903 case S_IFSOCK: 904 if (fmt != XFS_DINODE_FMT_DEV) 905 return true; 906 break; 907 case S_IFREG: 908 if (fmt == XFS_DINODE_FMT_LOCAL) 909 return true; 910 fallthrough; 911 case S_IFLNK: 912 case S_IFDIR: 913 switch (fmt) { 914 case XFS_DINODE_FMT_LOCAL: 915 case XFS_DINODE_FMT_EXTENTS: 916 case XFS_DINODE_FMT_BTREE: 917 break; 918 default: 919 return true; 920 } 921 break; 922 default: 923 return true; 924 } 925 926 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); 927 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 928 929 switch (fmt) { 930 case XFS_DINODE_FMT_DEV: 931 break; 932 case XFS_DINODE_FMT_LOCAL: 933 /* dir/symlink structure cannot be larger than the fork */ 934 if (data_size > dfork_size) 935 return true; 936 /* directory structure must pass verification. */ 937 if (S_ISDIR(mode) && 938 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) 939 return true; 940 /* symlink structure must pass verification. */ 941 if (S_ISLNK(mode) && 942 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) 943 return true; 944 break; 945 case XFS_DINODE_FMT_EXTENTS: 946 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, 947 XFS_DATA_FORK)) 948 return true; 949 break; 950 case XFS_DINODE_FMT_BTREE: 951 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 952 XFS_DATA_FORK)) 953 return true; 954 break; 955 default: 956 return true; 957 } 958 959 return false; 960 } 961 962 static void 963 xrep_dinode_set_data_nextents( 964 struct xfs_dinode *dip, 965 xfs_extnum_t nextents) 966 { 967 if (xfs_dinode_has_large_extent_counts(dip)) 968 dip->di_big_nextents = cpu_to_be64(nextents); 969 else 970 dip->di_nextents = cpu_to_be32(nextents); 971 } 972 973 static void 974 xrep_dinode_set_attr_nextents( 975 struct xfs_dinode *dip, 976 xfs_extnum_t nextents) 977 { 978 if (xfs_dinode_has_large_extent_counts(dip)) 979 dip->di_big_anextents = cpu_to_be32(nextents); 980 else 981 dip->di_anextents = cpu_to_be16(nextents); 982 } 983 984 /* Reset the data fork to something sane. */ 985 STATIC void 986 xrep_dinode_zap_dfork( 987 struct xrep_inode *ri, 988 struct xfs_dinode *dip, 989 uint16_t mode) 990 { 991 struct xfs_scrub *sc = ri->sc; 992 993 trace_xrep_dinode_zap_dfork(sc, dip); 994 995 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; 996 997 xrep_dinode_set_data_nextents(dip, 0); 998 ri->data_blocks = 0; 999 ri->rt_blocks = 0; 1000 1001 /* Special files always get reset to DEV */ 1002 switch (mode & S_IFMT) { 1003 case S_IFIFO: 1004 case S_IFCHR: 1005 case S_IFBLK: 1006 case S_IFSOCK: 1007 dip->di_format = XFS_DINODE_FMT_DEV; 1008 dip->di_size = 0; 1009 return; 1010 } 1011 1012 /* 1013 * If we have data extents, reset to an empty map and hope the user 1014 * will run the bmapbtd checker next. 1015 */ 1016 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { 1017 dip->di_format = XFS_DINODE_FMT_EXTENTS; 1018 return; 1019 } 1020 1021 /* Otherwise, reset the local format to the minimum. */ 1022 switch (mode & S_IFMT) { 1023 case S_IFLNK: 1024 xrep_dinode_zap_symlink(ri, dip); 1025 break; 1026 case S_IFDIR: 1027 xrep_dinode_zap_dir(ri, dip); 1028 break; 1029 } 1030 } 1031 1032 /* 1033 * Check the attr fork for things that will fail the ifork verifiers or the 1034 * ifork formatters. 1035 */ 1036 STATIC bool 1037 xrep_dinode_check_afork( 1038 struct xfs_scrub *sc, 1039 struct xfs_dinode *dip) 1040 { 1041 struct xfs_attr_sf_hdr *afork_ptr; 1042 size_t attr_size; 1043 unsigned int afork_size; 1044 1045 if (XFS_DFORK_BOFF(dip) == 0) 1046 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || 1047 xfs_dfork_attr_extents(dip) != 0; 1048 1049 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1050 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1051 1052 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { 1053 case XFS_DINODE_FMT_LOCAL: 1054 /* Fork has to be large enough to extract the xattr size. */ 1055 if (afork_size < sizeof(struct xfs_attr_sf_hdr)) 1056 return true; 1057 1058 /* xattr structure cannot be larger than the fork */ 1059 attr_size = be16_to_cpu(afork_ptr->totsize); 1060 if (attr_size > afork_size) 1061 return true; 1062 1063 /* xattr structure must pass verification. */ 1064 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; 1065 case XFS_DINODE_FMT_EXTENTS: 1066 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, 1067 XFS_ATTR_FORK)) 1068 return true; 1069 break; 1070 case XFS_DINODE_FMT_BTREE: 1071 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, 1072 XFS_ATTR_FORK)) 1073 return true; 1074 break; 1075 default: 1076 return true; 1077 } 1078 1079 return false; 1080 } 1081 1082 /* 1083 * Reset the attr fork to empty. Since the attr fork could have contained 1084 * ACLs, make the file readable only by root. 1085 */ 1086 STATIC void 1087 xrep_dinode_zap_afork( 1088 struct xrep_inode *ri, 1089 struct xfs_dinode *dip, 1090 uint16_t mode) 1091 { 1092 struct xfs_scrub *sc = ri->sc; 1093 1094 trace_xrep_dinode_zap_afork(sc, dip); 1095 1096 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; 1097 1098 dip->di_aformat = XFS_DINODE_FMT_EXTENTS; 1099 xrep_dinode_set_attr_nextents(dip, 0); 1100 ri->attr_blocks = 0; 1101 1102 /* 1103 * If the data fork is in btree format, removing the attr fork entirely 1104 * might cause verifier failures if the next level down in the bmbt 1105 * could now fit in the data fork area. 1106 */ 1107 if (dip->di_format != XFS_DINODE_FMT_BTREE) 1108 dip->di_forkoff = 0; 1109 dip->di_mode = cpu_to_be16(mode & ~0777); 1110 dip->di_uid = 0; 1111 dip->di_gid = 0; 1112 } 1113 1114 /* Make sure the fork offset is a sensible value. */ 1115 STATIC void 1116 xrep_dinode_ensure_forkoff( 1117 struct xrep_inode *ri, 1118 struct xfs_dinode *dip, 1119 uint16_t mode) 1120 { 1121 struct xfs_bmdr_block *bmdr; 1122 struct xfs_scrub *sc = ri->sc; 1123 xfs_extnum_t attr_extents, data_extents; 1124 size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); 1125 unsigned int lit_sz = XFS_LITINO(sc->mp); 1126 unsigned int afork_min, dfork_min; 1127 1128 trace_xrep_dinode_ensure_forkoff(sc, dip); 1129 1130 /* 1131 * Before calling this function, xrep_dinode_core ensured that both 1132 * forks actually fit inside their respective literal areas. If this 1133 * was not the case, the fork was reset to FMT_EXTENTS with zero 1134 * records. If the rmapbt scan found attr or data fork blocks, this 1135 * will be noted in the dinode_stats, and we must leave enough room 1136 * for the bmap repair code to reconstruct the mapping structure. 1137 * 1138 * First, compute the minimum space required for the attr fork. 1139 */ 1140 switch (dip->di_aformat) { 1141 case XFS_DINODE_FMT_LOCAL: 1142 /* 1143 * If we still have a shortform xattr structure at all, that 1144 * means the attr fork area was exactly large enough to fit 1145 * the sf structure. 1146 */ 1147 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1148 break; 1149 case XFS_DINODE_FMT_EXTENTS: 1150 attr_extents = xfs_dfork_attr_extents(dip); 1151 if (attr_extents) { 1152 /* 1153 * We must maintain sufficient space to hold the entire 1154 * extent map array in the data fork. Note that we 1155 * previously zapped the fork if it had no chance of 1156 * fitting in the inode. 1157 */ 1158 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; 1159 } else if (ri->attr_extents > 0) { 1160 /* 1161 * The attr fork thinks it has zero extents, but we 1162 * found some xattr extents. We need to leave enough 1163 * empty space here so that the incore attr fork will 1164 * get created (and hence trigger the attr fork bmap 1165 * repairer). 1166 */ 1167 afork_min = bmdr_minsz; 1168 } else { 1169 /* No extents on disk or found in rmapbt. */ 1170 afork_min = 0; 1171 } 1172 break; 1173 case XFS_DINODE_FMT_BTREE: 1174 /* Must have space for btree header and key/pointers. */ 1175 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1176 afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1177 break; 1178 default: 1179 /* We should never see any other formats. */ 1180 afork_min = 0; 1181 break; 1182 } 1183 1184 /* Compute the minimum space required for the data fork. */ 1185 switch (dip->di_format) { 1186 case XFS_DINODE_FMT_DEV: 1187 dfork_min = sizeof(__be32); 1188 break; 1189 case XFS_DINODE_FMT_UUID: 1190 dfork_min = sizeof(uuid_t); 1191 break; 1192 case XFS_DINODE_FMT_LOCAL: 1193 /* 1194 * If we still have a shortform data fork at all, that means 1195 * the data fork area was large enough to fit whatever was in 1196 * there. 1197 */ 1198 dfork_min = be64_to_cpu(dip->di_size); 1199 break; 1200 case XFS_DINODE_FMT_EXTENTS: 1201 data_extents = xfs_dfork_data_extents(dip); 1202 if (data_extents) { 1203 /* 1204 * We must maintain sufficient space to hold the entire 1205 * extent map array in the data fork. Note that we 1206 * previously zapped the fork if it had no chance of 1207 * fitting in the inode. 1208 */ 1209 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; 1210 } else if (ri->data_extents > 0 || ri->rt_extents > 0) { 1211 /* 1212 * The data fork thinks it has zero extents, but we 1213 * found some data extents. We need to leave enough 1214 * empty space here so that the data fork bmap repair 1215 * will recover the mappings. 1216 */ 1217 dfork_min = bmdr_minsz; 1218 } else { 1219 /* No extents on disk or found in rmapbt. */ 1220 dfork_min = 0; 1221 } 1222 break; 1223 case XFS_DINODE_FMT_BTREE: 1224 /* Must have space for btree header and key/pointers. */ 1225 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1226 dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1227 break; 1228 default: 1229 dfork_min = 0; 1230 break; 1231 } 1232 1233 /* 1234 * Round all values up to the nearest 8 bytes, because that is the 1235 * precision of di_forkoff. 1236 */ 1237 afork_min = roundup(afork_min, 8); 1238 dfork_min = roundup(dfork_min, 8); 1239 bmdr_minsz = roundup(bmdr_minsz, 8); 1240 1241 ASSERT(dfork_min <= lit_sz); 1242 ASSERT(afork_min <= lit_sz); 1243 1244 /* 1245 * If the data fork was zapped and we don't have enough space for the 1246 * recovery fork, move the attr fork up. 1247 */ 1248 if (dip->di_format == XFS_DINODE_FMT_EXTENTS && 1249 xfs_dfork_data_extents(dip) == 0 && 1250 (ri->data_extents > 0 || ri->rt_extents > 0) && 1251 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { 1252 if (bmdr_minsz + afork_min > lit_sz) { 1253 /* 1254 * The attr for and the stub fork we need to recover 1255 * the data fork won't both fit. Zap the attr fork. 1256 */ 1257 xrep_dinode_zap_afork(ri, dip, mode); 1258 afork_min = bmdr_minsz; 1259 } else { 1260 void *before, *after; 1261 1262 /* Otherwise, just slide the attr fork up. */ 1263 before = XFS_DFORK_APTR(dip); 1264 dip->di_forkoff = bmdr_minsz >> 3; 1265 after = XFS_DFORK_APTR(dip); 1266 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); 1267 } 1268 } 1269 1270 /* 1271 * If the attr fork was zapped and we don't have enough space for the 1272 * recovery fork, move the attr fork down. 1273 */ 1274 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && 1275 xfs_dfork_attr_extents(dip) == 0 && 1276 ri->attr_extents > 0 && 1277 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { 1278 if (dip->di_format == XFS_DINODE_FMT_BTREE) { 1279 /* 1280 * If the data fork is in btree format then we can't 1281 * adjust forkoff because that runs the risk of 1282 * violating the extents/btree format transition rules. 1283 */ 1284 } else if (bmdr_minsz + dfork_min > lit_sz) { 1285 /* 1286 * If we can't move the attr fork, too bad, we lose the 1287 * attr fork and leak its blocks. 1288 */ 1289 xrep_dinode_zap_afork(ri, dip, mode); 1290 } else { 1291 /* 1292 * Otherwise, just slide the attr fork down. The attr 1293 * fork is empty, so we don't have any old contents to 1294 * move here. 1295 */ 1296 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; 1297 } 1298 } 1299 } 1300 1301 /* 1302 * Zap the data/attr forks if we spot anything that isn't going to pass the 1303 * ifork verifiers or the ifork formatters, because we need to get the inode 1304 * into good enough shape that the higher level repair functions can run. 1305 */ 1306 STATIC void 1307 xrep_dinode_zap_forks( 1308 struct xrep_inode *ri, 1309 struct xfs_dinode *dip) 1310 { 1311 struct xfs_scrub *sc = ri->sc; 1312 xfs_extnum_t data_extents; 1313 xfs_extnum_t attr_extents; 1314 xfs_filblks_t nblocks; 1315 uint16_t mode; 1316 bool zap_datafork = false; 1317 bool zap_attrfork = ri->zap_acls; 1318 1319 trace_xrep_dinode_zap_forks(sc, dip); 1320 1321 mode = be16_to_cpu(dip->di_mode); 1322 1323 data_extents = xfs_dfork_data_extents(dip); 1324 attr_extents = xfs_dfork_attr_extents(dip); 1325 nblocks = be64_to_cpu(dip->di_nblocks); 1326 1327 /* Inode counters don't make sense? */ 1328 if (data_extents > nblocks) 1329 zap_datafork = true; 1330 if (attr_extents > nblocks) 1331 zap_attrfork = true; 1332 if (data_extents + attr_extents > nblocks) 1333 zap_datafork = zap_attrfork = true; 1334 1335 if (!zap_datafork) 1336 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); 1337 if (!zap_attrfork) 1338 zap_attrfork = xrep_dinode_check_afork(sc, dip); 1339 1340 /* Zap whatever's bad. */ 1341 if (zap_attrfork) 1342 xrep_dinode_zap_afork(ri, dip, mode); 1343 if (zap_datafork) 1344 xrep_dinode_zap_dfork(ri, dip, mode); 1345 xrep_dinode_ensure_forkoff(ri, dip, mode); 1346 1347 /* 1348 * Zero di_nblocks if we don't have any extents at all to satisfy the 1349 * buffer verifier. 1350 */ 1351 data_extents = xfs_dfork_data_extents(dip); 1352 attr_extents = xfs_dfork_attr_extents(dip); 1353 if (data_extents + attr_extents == 0) 1354 dip->di_nblocks = 0; 1355 } 1356 1357 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ 1358 STATIC int 1359 xrep_dinode_core( 1360 struct xrep_inode *ri) 1361 { 1362 struct xfs_scrub *sc = ri->sc; 1363 struct xfs_buf *bp; 1364 struct xfs_dinode *dip; 1365 xfs_ino_t ino = sc->sm->sm_ino; 1366 int error; 1367 int iget_error; 1368 1369 /* Figure out what this inode had mapped in both forks. */ 1370 error = xrep_dinode_count_rmaps(ri); 1371 if (error) 1372 return error; 1373 1374 /* Read the inode cluster buffer. */ 1375 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1376 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1377 NULL); 1378 if (error) 1379 return error; 1380 1381 /* Make sure we can pass the inode buffer verifier. */ 1382 xrep_dinode_buf(sc, bp); 1383 bp->b_ops = &xfs_inode_buf_ops; 1384 1385 /* Fix everything the verifier will complain about. */ 1386 dip = xfs_buf_offset(bp, ri->imap.im_boffset); 1387 xrep_dinode_header(sc, dip); 1388 iget_error = xrep_dinode_mode(ri, dip); 1389 if (iget_error) 1390 goto write; 1391 xrep_dinode_nlinks(dip); 1392 xrep_dinode_flags(sc, dip, ri->rt_extents > 0); 1393 xrep_dinode_size(ri, dip); 1394 xrep_dinode_extsize_hints(sc, dip); 1395 xrep_dinode_zap_forks(ri, dip); 1396 1397 write: 1398 /* Write out the inode. */ 1399 trace_xrep_dinode_fixed(sc, dip); 1400 xfs_dinode_calc_crc(sc->mp, dip); 1401 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); 1402 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, 1403 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); 1404 1405 /* 1406 * In theory, we've fixed the ondisk inode record enough that we should 1407 * be able to load the inode into the cache. Try to iget that inode 1408 * now while we hold the AGI and the inode cluster buffer and take the 1409 * IOLOCK so that we can continue with repairs without anyone else 1410 * accessing the inode. If iget fails, we still need to commit the 1411 * changes. 1412 */ 1413 if (!iget_error) 1414 iget_error = xchk_iget(sc, ino, &sc->ip); 1415 if (!iget_error) 1416 xchk_ilock(sc, XFS_IOLOCK_EXCL); 1417 1418 /* 1419 * Commit the inode cluster buffer updates and drop the AGI buffer that 1420 * we've been holding since scrub setup. From here on out, repairs 1421 * deal only with the cached inode. 1422 */ 1423 error = xrep_trans_commit(sc); 1424 if (error) 1425 return error; 1426 1427 if (iget_error) 1428 return iget_error; 1429 1430 error = xchk_trans_alloc(sc, 0); 1431 if (error) 1432 return error; 1433 1434 error = xrep_ino_dqattach(sc); 1435 if (error) 1436 return error; 1437 1438 xchk_ilock(sc, XFS_ILOCK_EXCL); 1439 if (ri->ino_sick_mask) 1440 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); 1441 return 0; 1442 } 1443 1444 /* Fix everything xfs_dinode_verify cares about. */ 1445 STATIC int 1446 xrep_dinode_problems( 1447 struct xrep_inode *ri) 1448 { 1449 struct xfs_scrub *sc = ri->sc; 1450 int error; 1451 1452 error = xrep_dinode_core(ri); 1453 if (error) 1454 return error; 1455 1456 /* We had to fix a totally busted inode, schedule quotacheck. */ 1457 if (XFS_IS_UQUOTA_ON(sc->mp)) 1458 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1459 if (XFS_IS_GQUOTA_ON(sc->mp)) 1460 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1461 if (XFS_IS_PQUOTA_ON(sc->mp)) 1462 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1463 1464 return 0; 1465 } 1466 1467 /* 1468 * Fix problems that the verifiers don't care about. In general these are 1469 * errors that don't cause problems elsewhere in the kernel that we can easily 1470 * detect, so we don't check them all that rigorously. 1471 */ 1472 1473 /* Make sure block and extent counts are ok. */ 1474 STATIC int 1475 xrep_inode_blockcounts( 1476 struct xfs_scrub *sc) 1477 { 1478 struct xfs_ifork *ifp; 1479 xfs_filblks_t count; 1480 xfs_filblks_t acount; 1481 xfs_extnum_t nextents; 1482 int error; 1483 1484 trace_xrep_inode_blockcounts(sc); 1485 1486 /* Set data fork counters from the data fork mappings. */ 1487 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1488 &nextents, &count); 1489 if (error) 1490 return error; 1491 if (xfs_is_reflink_inode(sc->ip)) { 1492 /* 1493 * data fork blockcount can exceed physical storage if a user 1494 * reflinks the same block over and over again. 1495 */ 1496 ; 1497 } else if (XFS_IS_REALTIME_INODE(sc->ip)) { 1498 if (count >= sc->mp->m_sb.sb_rblocks) 1499 return -EFSCORRUPTED; 1500 } else { 1501 if (count >= sc->mp->m_sb.sb_dblocks) 1502 return -EFSCORRUPTED; 1503 } 1504 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); 1505 if (error) 1506 return error; 1507 sc->ip->i_df.if_nextents = nextents; 1508 1509 /* Set attr fork counters from the attr fork mappings. */ 1510 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1511 if (ifp) { 1512 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1513 &nextents, &acount); 1514 if (error) 1515 return error; 1516 if (count >= sc->mp->m_sb.sb_dblocks) 1517 return -EFSCORRUPTED; 1518 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, 1519 nextents); 1520 if (error) 1521 return error; 1522 ifp->if_nextents = nextents; 1523 } else { 1524 acount = 0; 1525 } 1526 1527 sc->ip->i_nblocks = count + acount; 1528 return 0; 1529 } 1530 1531 /* Check for invalid uid/gid/prid. */ 1532 STATIC void 1533 xrep_inode_ids( 1534 struct xfs_scrub *sc) 1535 { 1536 bool dirty = false; 1537 1538 trace_xrep_inode_ids(sc); 1539 1540 if (!uid_valid(VFS_I(sc->ip)->i_uid)) { 1541 i_uid_write(VFS_I(sc->ip), 0); 1542 dirty = true; 1543 if (XFS_IS_UQUOTA_ON(sc->mp)) 1544 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1545 } 1546 1547 if (!gid_valid(VFS_I(sc->ip)->i_gid)) { 1548 i_gid_write(VFS_I(sc->ip), 0); 1549 dirty = true; 1550 if (XFS_IS_GQUOTA_ON(sc->mp)) 1551 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1552 } 1553 1554 if (sc->ip->i_projid == -1U) { 1555 sc->ip->i_projid = 0; 1556 dirty = true; 1557 if (XFS_IS_PQUOTA_ON(sc->mp)) 1558 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1559 } 1560 1561 /* strip setuid/setgid if we touched any of the ids */ 1562 if (dirty) 1563 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); 1564 } 1565 1566 static inline void 1567 xrep_clamp_timestamp( 1568 struct xfs_inode *ip, 1569 struct timespec64 *ts) 1570 { 1571 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); 1572 *ts = timestamp_truncate(*ts, VFS_I(ip)); 1573 } 1574 1575 /* Nanosecond counters can't have more than 1 billion. */ 1576 STATIC void 1577 xrep_inode_timestamps( 1578 struct xfs_inode *ip) 1579 { 1580 struct timespec64 tstamp; 1581 struct inode *inode = VFS_I(ip); 1582 1583 tstamp = inode_get_atime(inode); 1584 xrep_clamp_timestamp(ip, &tstamp); 1585 inode_set_atime_to_ts(inode, tstamp); 1586 1587 tstamp = inode_get_mtime(inode); 1588 xrep_clamp_timestamp(ip, &tstamp); 1589 inode_set_mtime_to_ts(inode, tstamp); 1590 1591 tstamp = inode_get_ctime(inode); 1592 xrep_clamp_timestamp(ip, &tstamp); 1593 inode_set_ctime_to_ts(inode, tstamp); 1594 1595 xrep_clamp_timestamp(ip, &ip->i_crtime); 1596 } 1597 1598 /* Fix inode flags that don't make sense together. */ 1599 STATIC void 1600 xrep_inode_flags( 1601 struct xfs_scrub *sc) 1602 { 1603 uint16_t mode; 1604 1605 trace_xrep_inode_flags(sc); 1606 1607 mode = VFS_I(sc->ip)->i_mode; 1608 1609 /* Clear junk flags */ 1610 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) 1611 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; 1612 1613 /* NEWRTBM only applies to realtime bitmaps */ 1614 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) 1615 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; 1616 else 1617 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; 1618 1619 /* These only make sense for directories. */ 1620 if (!S_ISDIR(mode)) 1621 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | 1622 XFS_DIFLAG_EXTSZINHERIT | 1623 XFS_DIFLAG_PROJINHERIT | 1624 XFS_DIFLAG_NOSYMLINKS); 1625 1626 /* These only make sense for files. */ 1627 if (!S_ISREG(mode)) 1628 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | 1629 XFS_DIFLAG_EXTSIZE); 1630 1631 /* These only make sense for non-rt files. */ 1632 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1633 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; 1634 1635 /* Immutable and append only? Drop the append. */ 1636 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && 1637 (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) 1638 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; 1639 1640 /* Clear junk flags. */ 1641 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) 1642 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; 1643 1644 /* No reflink flag unless we support it and it's a file. */ 1645 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) 1646 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1647 1648 /* DAX only applies to files and dirs. */ 1649 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1650 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1651 1652 /* No reflink files on the realtime device. */ 1653 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1654 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1655 } 1656 1657 /* 1658 * Fix size problems with block/node format directories. If we fail to find 1659 * the extent list, just bail out and let the bmapbtd repair functions clean 1660 * up that mess. 1661 */ 1662 STATIC void 1663 xrep_inode_blockdir_size( 1664 struct xfs_scrub *sc) 1665 { 1666 struct xfs_iext_cursor icur; 1667 struct xfs_bmbt_irec got; 1668 struct xfs_ifork *ifp; 1669 xfs_fileoff_t off; 1670 int error; 1671 1672 trace_xrep_inode_blockdir_size(sc); 1673 1674 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); 1675 if (error) 1676 return; 1677 1678 /* Find the last block before 32G; this is the dir size. */ 1679 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1680 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); 1681 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { 1682 /* zero-extents directory? */ 1683 return; 1684 } 1685 1686 off = got.br_startoff + got.br_blockcount; 1687 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, 1688 XFS_FSB_TO_B(sc->mp, off)); 1689 } 1690 1691 /* Fix size problems with short format directories. */ 1692 STATIC void 1693 xrep_inode_sfdir_size( 1694 struct xfs_scrub *sc) 1695 { 1696 struct xfs_ifork *ifp; 1697 1698 trace_xrep_inode_sfdir_size(sc); 1699 1700 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1701 sc->ip->i_disk_size = ifp->if_bytes; 1702 } 1703 1704 /* 1705 * Fix any irregularities in a directory inode's size now that we can iterate 1706 * extent maps and access other regular inode data. 1707 */ 1708 STATIC void 1709 xrep_inode_dir_size( 1710 struct xfs_scrub *sc) 1711 { 1712 trace_xrep_inode_dir_size(sc); 1713 1714 switch (sc->ip->i_df.if_format) { 1715 case XFS_DINODE_FMT_EXTENTS: 1716 case XFS_DINODE_FMT_BTREE: 1717 xrep_inode_blockdir_size(sc); 1718 break; 1719 case XFS_DINODE_FMT_LOCAL: 1720 xrep_inode_sfdir_size(sc); 1721 break; 1722 } 1723 } 1724 1725 /* Fix extent size hint problems. */ 1726 STATIC void 1727 xrep_inode_extsize( 1728 struct xfs_scrub *sc) 1729 { 1730 /* Fix misaligned extent size hints on a directory. */ 1731 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1732 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && 1733 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { 1734 sc->ip->i_extsize = 0; 1735 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; 1736 } 1737 } 1738 1739 /* Ensure this file has an attr fork if it needs to hold a parent pointer. */ 1740 STATIC int 1741 xrep_inode_pptr( 1742 struct xfs_scrub *sc) 1743 { 1744 struct xfs_mount *mp = sc->mp; 1745 struct xfs_inode *ip = sc->ip; 1746 struct inode *inode = VFS_I(ip); 1747 1748 if (!xfs_has_parent(mp)) 1749 return 0; 1750 1751 /* 1752 * Unlinked inodes that cannot be added to the directory tree will not 1753 * have a parent pointer. 1754 */ 1755 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) 1756 return 0; 1757 1758 /* The root directory doesn't have a parent pointer. */ 1759 if (ip == mp->m_rootip) 1760 return 0; 1761 1762 /* 1763 * Metadata inodes are rooted in the superblock and do not have any 1764 * parents. 1765 */ 1766 if (xfs_is_metadata_inode(ip)) 1767 return 0; 1768 1769 /* Inode already has an attr fork; no further work possible here. */ 1770 if (xfs_inode_has_attr_fork(ip)) 1771 return 0; 1772 1773 return xfs_bmap_add_attrfork(sc->tp, ip, 1774 sizeof(struct xfs_attr_sf_hdr), true); 1775 } 1776 1777 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1778 STATIC int 1779 xrep_inode_problems( 1780 struct xfs_scrub *sc) 1781 { 1782 int error; 1783 1784 error = xrep_inode_blockcounts(sc); 1785 if (error) 1786 return error; 1787 error = xrep_inode_pptr(sc); 1788 if (error) 1789 return error; 1790 xrep_inode_timestamps(sc->ip); 1791 xrep_inode_flags(sc); 1792 xrep_inode_ids(sc); 1793 /* 1794 * We can now do a better job fixing the size of a directory now that 1795 * we can scan the data fork extents than we could in xrep_dinode_size. 1796 */ 1797 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1798 xrep_inode_dir_size(sc); 1799 xrep_inode_extsize(sc); 1800 1801 trace_xrep_inode_fixed(sc); 1802 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1803 return xrep_roll_trans(sc); 1804 } 1805 1806 /* 1807 * Make sure this inode's unlinked list pointers are consistent with its 1808 * link count. 1809 */ 1810 STATIC int 1811 xrep_inode_unlinked( 1812 struct xfs_scrub *sc) 1813 { 1814 unsigned int nlink = VFS_I(sc->ip)->i_nlink; 1815 int error; 1816 1817 /* 1818 * If this inode is linked from the directory tree and on the unlinked 1819 * list, remove it from the unlinked list. 1820 */ 1821 if (nlink > 0 && xfs_inode_on_unlinked_list(sc->ip)) { 1822 struct xfs_perag *pag; 1823 int error; 1824 1825 pag = xfs_perag_get(sc->mp, 1826 XFS_INO_TO_AGNO(sc->mp, sc->ip->i_ino)); 1827 error = xfs_iunlink_remove(sc->tp, pag, sc->ip); 1828 xfs_perag_put(pag); 1829 if (error) 1830 return error; 1831 } 1832 1833 /* 1834 * If this inode is not linked from the directory tree yet not on the 1835 * unlinked list, put it on the unlinked list. 1836 */ 1837 if (nlink == 0 && !xfs_inode_on_unlinked_list(sc->ip)) { 1838 error = xfs_iunlink(sc->tp, sc->ip); 1839 if (error) 1840 return error; 1841 } 1842 1843 return 0; 1844 } 1845 1846 /* Repair an inode's fields. */ 1847 int 1848 xrep_inode( 1849 struct xfs_scrub *sc) 1850 { 1851 int error = 0; 1852 1853 /* 1854 * No inode? That means we failed the _iget verifiers. Repair all 1855 * the things that the inode verifiers care about, then retry _iget. 1856 */ 1857 if (!sc->ip) { 1858 struct xrep_inode *ri = sc->buf; 1859 1860 ASSERT(ri != NULL); 1861 1862 error = xrep_dinode_problems(ri); 1863 if (error == -EBUSY) { 1864 /* 1865 * Directory scan to recover inode mode encountered a 1866 * busy inode, so we did not continue repairing things. 1867 */ 1868 return 0; 1869 } 1870 if (error) 1871 return error; 1872 1873 /* By this point we had better have a working incore inode. */ 1874 if (!sc->ip) 1875 return -EFSCORRUPTED; 1876 } 1877 1878 xfs_trans_ijoin(sc->tp, sc->ip, 0); 1879 1880 /* If we found corruption of any kind, try to fix it. */ 1881 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || 1882 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { 1883 error = xrep_inode_problems(sc); 1884 if (error) 1885 return error; 1886 } 1887 1888 /* See if we can clear the reflink flag. */ 1889 if (xfs_is_reflink_inode(sc->ip)) { 1890 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); 1891 if (error) 1892 return error; 1893 } 1894 1895 /* Reconnect incore unlinked list */ 1896 error = xrep_inode_unlinked(sc); 1897 if (error) 1898 return error; 1899 1900 return xrep_defer_finish(sc); 1901 } 1902