1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_icache.h" 20 #include "xfs_inode_buf.h" 21 #include "xfs_inode_fork.h" 22 #include "xfs_ialloc.h" 23 #include "xfs_da_format.h" 24 #include "xfs_reflink.h" 25 #include "xfs_alloc.h" 26 #include "xfs_rmap.h" 27 #include "xfs_rmap_btree.h" 28 #include "xfs_bmap.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_util.h" 31 #include "xfs_dir2.h" 32 #include "xfs_dir2_priv.h" 33 #include "xfs_quota_defs.h" 34 #include "xfs_quota.h" 35 #include "xfs_ag.h" 36 #include "xfs_rtbitmap.h" 37 #include "xfs_attr_leaf.h" 38 #include "xfs_log_priv.h" 39 #include "xfs_health.h" 40 #include "xfs_symlink_remote.h" 41 #include "scrub/xfs_scrub.h" 42 #include "scrub/scrub.h" 43 #include "scrub/common.h" 44 #include "scrub/btree.h" 45 #include "scrub/trace.h" 46 #include "scrub/repair.h" 47 #include "scrub/iscan.h" 48 #include "scrub/readdir.h" 49 50 /* 51 * Inode Record Repair 52 * =================== 53 * 54 * Roughly speaking, inode problems can be classified based on whether or not 55 * they trip the dinode verifiers. If those trip, then we won't be able to 56 * xfs_iget ourselves the inode. 57 * 58 * Therefore, the xrep_dinode_* functions fix anything that will cause the 59 * inode buffer verifier or the dinode verifier to fail. The xrep_inode_* functions 60 * fix things on live incore inodes. 
The inode repair functions make decisions 61 * with security and usability implications when reviving a file: 62 * 63 * - Files with zero di_mode or a garbage di_mode are converted to regular file 64 * that only root can read. This file may not actually contain user data, 65 * if the file was not previously a regular file. Setuid and setgid bits 66 * are cleared. 67 * 68 * - Zero-size directories can be truncated to look empty. It is necessary to 69 * run the bmapbtd and directory repair functions to fully rebuild the 70 * directory. 71 * 72 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary 73 * to run the bmapbtd and symlink repair functions to salvage the symlink. 74 * 75 * - Invalid extent size hints will be removed. 76 * 77 * - Quotacheck will be scheduled if we repaired an inode that was so badly 78 * damaged that the ondisk inode had to be rebuilt. 79 * 80 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. 81 * Setuid and setgid bits are cleared. 82 * 83 * - Data and attr forks are reset to extents format with zero extents if the 84 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta 85 * repair functions to recover the space mapping. 86 * 87 * - ACLs will not be recovered if the attr fork is zapped or the extended 88 * attribute structure itself requires salvaging. 89 * 90 * - If the attr fork is zapped, the user and group ids are reset to root and 91 * the setuid and setgid bits are removed. 92 */ 93 94 /* 95 * All the information we need to repair the ondisk inode if we can't iget the 96 * incore inode. We don't allocate this buffer unless we're going to perform 97 * a repair to the ondisk inode cluster buffer. 98 */ 99 struct xrep_inode { 100 /* Inode mapping that we saved from the initial lookup attempt. */ 101 struct xfs_imap imap; 102 103 struct xfs_scrub *sc; 104 105 /* Blocks in use on the data device by data extents or bmbt blocks. 
*/ 106 xfs_rfsblock_t data_blocks; 107 108 /* Blocks in use on the rt device. */ 109 xfs_rfsblock_t rt_blocks; 110 111 /* Blocks in use by the attr fork. */ 112 xfs_rfsblock_t attr_blocks; 113 114 /* Number of data device extents for the data fork. */ 115 xfs_extnum_t data_extents; 116 117 /* 118 * Number of realtime device extents for the data fork. If 119 * data_extents and rt_extents indicate that the data fork has extents 120 * on both devices, we'll just back away slowly. 121 */ 122 xfs_extnum_t rt_extents; 123 124 /* Number of (data device) extents for the attr fork. */ 125 xfs_aextnum_t attr_extents; 126 127 /* Sick state to set after zapping parts of the inode. */ 128 unsigned int ino_sick_mask; 129 130 /* Must we remove all access from this file? */ 131 bool zap_acls; 132 133 /* Inode scanner to see if we can find the ftype from dirents */ 134 struct xchk_iscan ftype_iscan; 135 uint8_t alleged_ftype; 136 }; 137 138 /* 139 * Setup function for inode repair. @imap contains the ondisk inode mapping 140 * information so that we can correct the ondisk inode cluster buffer if 141 * necessary to make iget work. 142 */ 143 int 144 xrep_setup_inode( 145 struct xfs_scrub *sc, 146 const struct xfs_imap *imap) 147 { 148 struct xrep_inode *ri; 149 150 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); 151 if (!sc->buf) 152 return -ENOMEM; 153 154 ri = sc->buf; 155 memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); 156 ri->sc = sc; 157 return 0; 158 } 159 160 /* 161 * Make sure this ondisk inode can pass the inode buffer verifier. This is 162 * not the same as the dinode verifier. 
163 */ 164 STATIC void 165 xrep_dinode_buf_core( 166 struct xfs_scrub *sc, 167 struct xfs_buf *bp, 168 unsigned int ioffset) 169 { 170 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); 171 struct xfs_trans *tp = sc->tp; 172 struct xfs_mount *mp = sc->mp; 173 xfs_agino_t agino; 174 bool crc_ok = false; 175 bool magic_ok = false; 176 bool unlinked_ok = false; 177 178 agino = be32_to_cpu(dip->di_next_unlinked); 179 180 if (xfs_verify_agino_or_null(bp->b_pag, agino)) 181 unlinked_ok = true; 182 183 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 184 xfs_dinode_good_version(mp, dip->di_version)) 185 magic_ok = true; 186 187 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 188 XFS_DINODE_CRC_OFF)) 189 crc_ok = true; 190 191 if (magic_ok && unlinked_ok && crc_ok) 192 return; 193 194 if (!magic_ok) { 195 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 196 dip->di_version = 3; 197 } 198 if (!unlinked_ok) 199 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 200 xfs_dinode_calc_crc(mp, dip); 201 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 202 xfs_trans_log_buf(tp, bp, ioffset, 203 ioffset + sizeof(struct xfs_dinode) - 1); 204 } 205 206 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ 207 STATIC void 208 xrep_dinode_buf( 209 struct xfs_scrub *sc, 210 struct xfs_buf *bp) 211 { 212 struct xfs_mount *mp = sc->mp; 213 int i; 214 int ni; 215 216 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 217 for (i = 0; i < ni; i++) 218 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); 219 } 220 221 /* Reinitialize things that never change in an inode. 
*/ 222 STATIC void 223 xrep_dinode_header( 224 struct xfs_scrub *sc, 225 struct xfs_dinode *dip) 226 { 227 trace_xrep_dinode_header(sc, dip); 228 229 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 230 if (!xfs_dinode_good_version(sc->mp, dip->di_version)) 231 dip->di_version = 3; 232 dip->di_ino = cpu_to_be64(sc->sm->sm_ino); 233 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); 234 dip->di_gen = cpu_to_be32(sc->sm->sm_gen); 235 } 236 237 /* 238 * If this directory entry points to the scrub target inode, then the directory 239 * we're scanning is the parent of the scrub target inode. 240 */ 241 STATIC int 242 xrep_dinode_findmode_dirent( 243 struct xfs_scrub *sc, 244 struct xfs_inode *dp, 245 xfs_dir2_dataptr_t dapos, 246 const struct xfs_name *name, 247 xfs_ino_t ino, 248 void *priv) 249 { 250 struct xrep_inode *ri = priv; 251 int error = 0; 252 253 if (xchk_should_terminate(ri->sc, &error)) 254 return error; 255 256 if (ino != sc->sm->sm_ino) 257 return 0; 258 259 /* Ignore garbage directory entry names. */ 260 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) 261 return -EFSCORRUPTED; 262 263 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 264 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 265 xfs_dir2_samename(name, &xfs_name_dot)) 266 return 0; 267 268 /* 269 * Uhoh, more than one parent for this inode and they don't agree on 270 * the file type? 271 */ 272 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && 273 ri->alleged_ftype != name->type) { 274 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, 275 ri->alleged_ftype); 276 return -EFSCORRUPTED; 277 } 278 279 /* We found a potential parent; remember the ftype. */ 280 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); 281 ri->alleged_ftype = name->type; 282 return 0; 283 } 284 285 /* Try to lock a directory, or wait a jiffy. 
*/ 286 static inline int 287 xrep_dinode_ilock_nowait( 288 struct xfs_inode *dp, 289 unsigned int lock_mode) 290 { 291 if (xfs_ilock_nowait(dp, lock_mode)) 292 return true; 293 294 schedule_timeout_killable(1); 295 return false; 296 } 297 298 /* 299 * Try to lock a directory to look for ftype hints. Since we already hold the 300 * AGI buffer, we cannot block waiting for the ILOCK because rename can take 301 * the ILOCK and then try to lock AGIs. 302 */ 303 STATIC int 304 xrep_dinode_trylock_directory( 305 struct xrep_inode *ri, 306 struct xfs_inode *dp, 307 unsigned int *lock_modep) 308 { 309 unsigned long deadline = jiffies + msecs_to_jiffies(30000); 310 unsigned int lock_mode; 311 int error = 0; 312 313 do { 314 if (xchk_should_terminate(ri->sc, &error)) 315 return error; 316 317 if (xfs_need_iread_extents(&dp->i_df)) 318 lock_mode = XFS_ILOCK_EXCL; 319 else 320 lock_mode = XFS_ILOCK_SHARED; 321 322 if (xrep_dinode_ilock_nowait(dp, lock_mode)) { 323 *lock_modep = lock_mode; 324 return 0; 325 } 326 } while (!time_is_before_jiffies(deadline)); 327 return -EBUSY; 328 } 329 330 /* 331 * If this is a directory, walk the dirents looking for any that point to the 332 * scrub target inode. 333 */ 334 STATIC int 335 xrep_dinode_findmode_walk_directory( 336 struct xrep_inode *ri, 337 struct xfs_inode *dp) 338 { 339 struct xfs_scrub *sc = ri->sc; 340 unsigned int lock_mode; 341 int error = 0; 342 343 /* 344 * Scan the directory to see if there it contains an entry pointing to 345 * the directory that we are repairing. 346 */ 347 error = xrep_dinode_trylock_directory(ri, dp, &lock_mode); 348 if (error) 349 return error; 350 351 /* 352 * If this directory is known to be sick, we cannot scan it reliably 353 * and must abort. 
354 */ 355 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | 356 XFS_SICK_INO_BMBTD | 357 XFS_SICK_INO_DIR)) { 358 error = -EFSCORRUPTED; 359 goto out_unlock; 360 } 361 362 /* 363 * We cannot complete our parent pointer scan if a directory looks as 364 * though it has been zapped by the inode record repair code. 365 */ 366 if (xchk_dir_looks_zapped(dp)) { 367 error = -EBUSY; 368 goto out_unlock; 369 } 370 371 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); 372 if (error) 373 goto out_unlock; 374 375 out_unlock: 376 xfs_iunlock(dp, lock_mode); 377 return error; 378 } 379 380 /* 381 * Try to find the mode of the inode being repaired by looking for directories 382 * that point down to this file. 383 */ 384 STATIC int 385 xrep_dinode_find_mode( 386 struct xrep_inode *ri, 387 uint16_t *mode) 388 { 389 struct xfs_scrub *sc = ri->sc; 390 struct xfs_inode *dp; 391 int error; 392 393 /* No ftype means we have no other metadata to consult. */ 394 if (!xfs_has_ftype(sc->mp)) { 395 *mode = S_IFREG; 396 return 0; 397 } 398 399 /* 400 * Scan all directories for parents that might point down to this 401 * inode. Skip the inode being repaired during the scan since it 402 * cannot be its own parent. Note that we still hold the AGI locked 403 * so there's a real possibility that _iscan_iter can return EBUSY. 
404 */ 405 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); 406 xchk_iscan_set_agi_trylock(&ri->ftype_iscan); 407 ri->ftype_iscan.skip_ino = sc->sm->sm_ino; 408 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; 409 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { 410 if (S_ISDIR(VFS_I(dp)->i_mode)) 411 error = xrep_dinode_findmode_walk_directory(ri, dp); 412 xchk_iscan_mark_visited(&ri->ftype_iscan, dp); 413 xchk_irele(sc, dp); 414 if (error < 0) 415 break; 416 if (xchk_should_terminate(sc, &error)) 417 break; 418 } 419 xchk_iscan_iter_finish(&ri->ftype_iscan); 420 xchk_iscan_teardown(&ri->ftype_iscan); 421 422 if (error == -EBUSY) { 423 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { 424 /* 425 * If we got an EBUSY after finding at least one 426 * dirent, that means the scan found an inode on the 427 * inactivation list and could not open it. Accept the 428 * alleged ftype and install a new mode below. 429 */ 430 error = 0; 431 } else if (!(sc->flags & XCHK_TRY_HARDER)) { 432 /* 433 * Otherwise, retry the operation one time to see if 434 * the reason for the delay is an inode from the same 435 * cluster buffer waiting on the inactivation list. 436 */ 437 error = -EDEADLOCK; 438 } 439 } 440 if (error) 441 return error; 442 443 /* 444 * Convert the discovered ftype into the file mode. If all else fails, 445 * return S_IFREG. 446 */ 447 switch (ri->alleged_ftype) { 448 case XFS_DIR3_FT_DIR: 449 *mode = S_IFDIR; 450 break; 451 case XFS_DIR3_FT_WHT: 452 case XFS_DIR3_FT_CHRDEV: 453 *mode = S_IFCHR; 454 break; 455 case XFS_DIR3_FT_BLKDEV: 456 *mode = S_IFBLK; 457 break; 458 case XFS_DIR3_FT_FIFO: 459 *mode = S_IFIFO; 460 break; 461 case XFS_DIR3_FT_SOCK: 462 *mode = S_IFSOCK; 463 break; 464 case XFS_DIR3_FT_SYMLINK: 465 *mode = S_IFLNK; 466 break; 467 default: 468 *mode = S_IFREG; 469 break; 470 } 471 return 0; 472 } 473 474 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. 
*/ 475 STATIC int 476 xrep_dinode_mode( 477 struct xrep_inode *ri, 478 struct xfs_dinode *dip) 479 { 480 struct xfs_scrub *sc = ri->sc; 481 uint16_t mode = be16_to_cpu(dip->di_mode); 482 int error; 483 484 trace_xrep_dinode_mode(sc, dip); 485 486 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) 487 return 0; 488 489 /* Try to fix the mode. If we cannot, then leave everything alone. */ 490 error = xrep_dinode_find_mode(ri, &mode); 491 switch (error) { 492 case -EINTR: 493 case -EBUSY: 494 case -EDEADLOCK: 495 /* temporary failure or fatal signal */ 496 return error; 497 case 0: 498 /* found mode */ 499 break; 500 default: 501 /* some other error, assume S_IFREG */ 502 mode = S_IFREG; 503 break; 504 } 505 506 /* bad mode, so we set it to a file that only root can read */ 507 dip->di_mode = cpu_to_be16(mode); 508 dip->di_uid = 0; 509 dip->di_gid = 0; 510 ri->zap_acls = true; 511 return 0; 512 } 513 514 /* Fix any conflicting flags that the verifiers complain about. */ 515 STATIC void 516 xrep_dinode_flags( 517 struct xfs_scrub *sc, 518 struct xfs_dinode *dip, 519 bool isrt) 520 { 521 struct xfs_mount *mp = sc->mp; 522 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 523 uint16_t flags = be16_to_cpu(dip->di_flags); 524 uint16_t mode = be16_to_cpu(dip->di_mode); 525 526 trace_xrep_dinode_flags(sc, dip); 527 528 if (isrt) 529 flags |= XFS_DIFLAG_REALTIME; 530 else 531 flags &= ~XFS_DIFLAG_REALTIME; 532 533 /* 534 * For regular files on a reflink filesystem, set the REFLINK flag to 535 * protect shared extents. A later stage will actually check those 536 * extents and clear the flag if possible. 
537 */ 538 if (xfs_has_reflink(mp) && S_ISREG(mode)) 539 flags2 |= XFS_DIFLAG2_REFLINK; 540 else 541 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 542 if (flags & XFS_DIFLAG_REALTIME) 543 flags2 &= ~XFS_DIFLAG2_REFLINK; 544 if (!xfs_has_bigtime(mp)) 545 flags2 &= ~XFS_DIFLAG2_BIGTIME; 546 if (!xfs_has_large_extent_counts(mp)) 547 flags2 &= ~XFS_DIFLAG2_NREXT64; 548 if (flags2 & XFS_DIFLAG2_NREXT64) 549 dip->di_nrext64_pad = 0; 550 else if (dip->di_version >= 3) 551 dip->di_v3_pad = 0; 552 dip->di_flags = cpu_to_be16(flags); 553 dip->di_flags2 = cpu_to_be64(flags2); 554 } 555 556 /* 557 * Blow out symlink; now it points nowhere. We don't have to worry about 558 * incore state because this inode is failing the verifiers. 559 */ 560 STATIC void 561 xrep_dinode_zap_symlink( 562 struct xrep_inode *ri, 563 struct xfs_dinode *dip) 564 { 565 struct xfs_scrub *sc = ri->sc; 566 char *p; 567 568 trace_xrep_dinode_zap_symlink(sc, dip); 569 570 dip->di_format = XFS_DINODE_FMT_LOCAL; 571 dip->di_size = cpu_to_be64(1); 572 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 573 *p = '?'; 574 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; 575 } 576 577 /* 578 * Blow out dir, make the parent point to the root. In the future repair will 579 * reconstruct this directory for us. Note that there's no in-core directory 580 * inode because the sf verifier tripped, so we don't have to worry about the 581 * dentry cache. 
582 */ 583 STATIC void 584 xrep_dinode_zap_dir( 585 struct xrep_inode *ri, 586 struct xfs_dinode *dip) 587 { 588 struct xfs_scrub *sc = ri->sc; 589 struct xfs_mount *mp = sc->mp; 590 struct xfs_dir2_sf_hdr *sfp; 591 int i8count; 592 593 trace_xrep_dinode_zap_dir(sc, dip); 594 595 dip->di_format = XFS_DINODE_FMT_LOCAL; 596 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; 597 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 598 sfp->count = 0; 599 sfp->i8count = i8count; 600 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); 601 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); 602 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; 603 } 604 605 /* Make sure we don't have a garbage file size. */ 606 STATIC void 607 xrep_dinode_size( 608 struct xrep_inode *ri, 609 struct xfs_dinode *dip) 610 { 611 struct xfs_scrub *sc = ri->sc; 612 uint64_t size = be64_to_cpu(dip->di_size); 613 uint16_t mode = be16_to_cpu(dip->di_mode); 614 615 trace_xrep_dinode_size(sc, dip); 616 617 switch (mode & S_IFMT) { 618 case S_IFIFO: 619 case S_IFCHR: 620 case S_IFBLK: 621 case S_IFSOCK: 622 /* di_size can't be nonzero for special files */ 623 dip->di_size = 0; 624 break; 625 case S_IFREG: 626 /* Regular files can't be larger than 2^63-1 bytes. */ 627 dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); 628 break; 629 case S_IFLNK: 630 /* 631 * Truncate ridiculously oversized symlinks. If the size is 632 * zero, reset it to point to the current directory. Both of 633 * these conditions trigger dinode verifier errors, so there 634 * is no in-core state to reset. 635 */ 636 if (size > XFS_SYMLINK_MAXLEN) 637 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); 638 else if (size == 0) 639 xrep_dinode_zap_symlink(ri, dip); 640 break; 641 case S_IFDIR: 642 /* 643 * Directories can't have a size larger than 32G. If the size 644 * is zero, reset it to an empty directory. Both of these 645 * conditions trigger dinode verifier errors, so there is no 646 * in-core state to reset. 
647 */ 648 if (size > XFS_DIR2_SPACE_SIZE) 649 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); 650 else if (size == 0) 651 xrep_dinode_zap_dir(ri, dip); 652 break; 653 } 654 } 655 656 /* Fix extent size hints. */ 657 STATIC void 658 xrep_dinode_extsize_hints( 659 struct xfs_scrub *sc, 660 struct xfs_dinode *dip) 661 { 662 struct xfs_mount *mp = sc->mp; 663 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 664 uint16_t flags = be16_to_cpu(dip->di_flags); 665 uint16_t mode = be16_to_cpu(dip->di_mode); 666 667 xfs_failaddr_t fa; 668 669 trace_xrep_dinode_extsize_hints(sc, dip); 670 671 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 672 mode, flags); 673 if (fa) { 674 dip->di_extsize = 0; 675 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | 676 XFS_DIFLAG_EXTSZINHERIT); 677 } 678 679 if (dip->di_version < 3) 680 return; 681 682 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 683 mode, flags, flags2); 684 if (fa) { 685 dip->di_cowextsize = 0; 686 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); 687 } 688 } 689 690 /* Count extents and blocks for an inode given an rmap. */ 691 STATIC int 692 xrep_dinode_walk_rmap( 693 struct xfs_btree_cur *cur, 694 const struct xfs_rmap_irec *rec, 695 void *priv) 696 { 697 struct xrep_inode *ri = priv; 698 int error = 0; 699 700 if (xchk_should_terminate(ri->sc, &error)) 701 return error; 702 703 /* We only care about this inode. */ 704 if (rec->rm_owner != ri->sc->sm->sm_ino) 705 return 0; 706 707 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { 708 ri->attr_blocks += rec->rm_blockcount; 709 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 710 ri->attr_extents++; 711 712 return 0; 713 } 714 715 ri->data_blocks += rec->rm_blockcount; 716 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 717 ri->data_extents++; 718 719 return 0; 720 } 721 722 /* Count extents and blocks for an inode from all AG rmap data. 
*/ 723 STATIC int 724 xrep_dinode_count_ag_rmaps( 725 struct xrep_inode *ri, 726 struct xfs_perag *pag) 727 { 728 struct xfs_btree_cur *cur; 729 struct xfs_buf *agf; 730 int error; 731 732 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); 733 if (error) 734 return error; 735 736 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); 737 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); 738 xfs_btree_del_cursor(cur, error); 739 xfs_trans_brelse(ri->sc->tp, agf); 740 return error; 741 } 742 743 /* Count extents and blocks for a given inode from all rmap data. */ 744 STATIC int 745 xrep_dinode_count_rmaps( 746 struct xrep_inode *ri) 747 { 748 struct xfs_perag *pag; 749 xfs_agnumber_t agno; 750 int error; 751 752 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 753 return -EOPNOTSUPP; 754 755 for_each_perag(ri->sc->mp, agno, pag) { 756 error = xrep_dinode_count_ag_rmaps(ri, pag); 757 if (error) { 758 xfs_perag_rele(pag); 759 return error; 760 } 761 } 762 763 /* Can't have extents on both the rt and the data device. */ 764 if (ri->data_extents && ri->rt_extents) 765 return -EFSCORRUPTED; 766 767 trace_xrep_dinode_count_rmaps(ri->sc, 768 ri->data_blocks, ri->rt_blocks, ri->attr_blocks, 769 ri->data_extents, ri->rt_extents, ri->attr_extents); 770 return 0; 771 } 772 773 /* Return true if this extents-format ifork looks like garbage. 
*/ 774 STATIC bool 775 xrep_dinode_bad_extents_fork( 776 struct xfs_scrub *sc, 777 struct xfs_dinode *dip, 778 unsigned int dfork_size, 779 int whichfork) 780 { 781 struct xfs_bmbt_irec new; 782 struct xfs_bmbt_rec *dp; 783 xfs_extnum_t nex; 784 bool isrt; 785 unsigned int i; 786 787 nex = xfs_dfork_nextents(dip, whichfork); 788 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) 789 return true; 790 791 dp = XFS_DFORK_PTR(dip, whichfork); 792 793 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); 794 for (i = 0; i < nex; i++, dp++) { 795 xfs_failaddr_t fa; 796 797 xfs_bmbt_disk_get_all(dp, &new); 798 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, 799 &new); 800 if (fa) 801 return true; 802 } 803 804 return false; 805 } 806 807 /* Return true if this btree-format ifork looks like garbage. */ 808 STATIC bool 809 xrep_dinode_bad_bmbt_fork( 810 struct xfs_scrub *sc, 811 struct xfs_dinode *dip, 812 unsigned int dfork_size, 813 int whichfork) 814 { 815 struct xfs_bmdr_block *dfp; 816 xfs_extnum_t nex; 817 unsigned int i; 818 unsigned int dmxr; 819 unsigned int nrecs; 820 unsigned int level; 821 822 nex = xfs_dfork_nextents(dip, whichfork); 823 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) 824 return true; 825 826 if (dfork_size < sizeof(struct xfs_bmdr_block)) 827 return true; 828 829 dfp = XFS_DFORK_PTR(dip, whichfork); 830 nrecs = be16_to_cpu(dfp->bb_numrecs); 831 level = be16_to_cpu(dfp->bb_level); 832 833 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) 834 return true; 835 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) 836 return true; 837 838 dmxr = xfs_bmdr_maxrecs(dfork_size, 0); 839 for (i = 1; i <= nrecs; i++) { 840 struct xfs_bmbt_key *fkp; 841 xfs_bmbt_ptr_t *fpp; 842 xfs_fileoff_t fileoff; 843 xfs_fsblock_t fsbno; 844 845 fkp = XFS_BMDR_KEY_ADDR(dfp, i); 846 fileoff = be64_to_cpu(fkp->br_startoff); 847 if (!xfs_verify_fileoff(sc->mp, fileoff)) 848 return true; 849 850 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); 
851 fsbno = be64_to_cpu(*fpp); 852 if (!xfs_verify_fsbno(sc->mp, fsbno)) 853 return true; 854 } 855 856 return false; 857 } 858 859 /* 860 * Check the data fork for things that will fail the ifork verifiers or the 861 * ifork formatters. 862 */ 863 STATIC bool 864 xrep_dinode_check_dfork( 865 struct xfs_scrub *sc, 866 struct xfs_dinode *dip, 867 uint16_t mode) 868 { 869 void *dfork_ptr; 870 int64_t data_size; 871 unsigned int fmt; 872 unsigned int dfork_size; 873 874 /* 875 * Verifier functions take signed int64_t, so check for bogus negative 876 * values first. 877 */ 878 data_size = be64_to_cpu(dip->di_size); 879 if (data_size < 0) 880 return true; 881 882 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); 883 switch (mode & S_IFMT) { 884 case S_IFIFO: 885 case S_IFCHR: 886 case S_IFBLK: 887 case S_IFSOCK: 888 if (fmt != XFS_DINODE_FMT_DEV) 889 return true; 890 break; 891 case S_IFREG: 892 if (fmt == XFS_DINODE_FMT_LOCAL) 893 return true; 894 fallthrough; 895 case S_IFLNK: 896 case S_IFDIR: 897 switch (fmt) { 898 case XFS_DINODE_FMT_LOCAL: 899 case XFS_DINODE_FMT_EXTENTS: 900 case XFS_DINODE_FMT_BTREE: 901 break; 902 default: 903 return true; 904 } 905 break; 906 default: 907 return true; 908 } 909 910 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); 911 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 912 913 switch (fmt) { 914 case XFS_DINODE_FMT_DEV: 915 break; 916 case XFS_DINODE_FMT_LOCAL: 917 /* dir/symlink structure cannot be larger than the fork */ 918 if (data_size > dfork_size) 919 return true; 920 /* directory structure must pass verification. */ 921 if (S_ISDIR(mode) && 922 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) 923 return true; 924 /* symlink structure must pass verification. 
*/ 925 if (S_ISLNK(mode) && 926 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) 927 return true; 928 break; 929 case XFS_DINODE_FMT_EXTENTS: 930 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, 931 XFS_DATA_FORK)) 932 return true; 933 break; 934 case XFS_DINODE_FMT_BTREE: 935 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 936 XFS_DATA_FORK)) 937 return true; 938 break; 939 default: 940 return true; 941 } 942 943 return false; 944 } 945 946 static void 947 xrep_dinode_set_data_nextents( 948 struct xfs_dinode *dip, 949 xfs_extnum_t nextents) 950 { 951 if (xfs_dinode_has_large_extent_counts(dip)) 952 dip->di_big_nextents = cpu_to_be64(nextents); 953 else 954 dip->di_nextents = cpu_to_be32(nextents); 955 } 956 957 static void 958 xrep_dinode_set_attr_nextents( 959 struct xfs_dinode *dip, 960 xfs_extnum_t nextents) 961 { 962 if (xfs_dinode_has_large_extent_counts(dip)) 963 dip->di_big_anextents = cpu_to_be32(nextents); 964 else 965 dip->di_anextents = cpu_to_be16(nextents); 966 } 967 968 /* Reset the data fork to something sane. */ 969 STATIC void 970 xrep_dinode_zap_dfork( 971 struct xrep_inode *ri, 972 struct xfs_dinode *dip, 973 uint16_t mode) 974 { 975 struct xfs_scrub *sc = ri->sc; 976 977 trace_xrep_dinode_zap_dfork(sc, dip); 978 979 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; 980 981 xrep_dinode_set_data_nextents(dip, 0); 982 ri->data_blocks = 0; 983 ri->rt_blocks = 0; 984 985 /* Special files always get reset to DEV */ 986 switch (mode & S_IFMT) { 987 case S_IFIFO: 988 case S_IFCHR: 989 case S_IFBLK: 990 case S_IFSOCK: 991 dip->di_format = XFS_DINODE_FMT_DEV; 992 dip->di_size = 0; 993 return; 994 } 995 996 /* 997 * If we have data extents, reset to an empty map and hope the user 998 * will run the bmapbtd checker next. 999 */ 1000 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { 1001 dip->di_format = XFS_DINODE_FMT_EXTENTS; 1002 return; 1003 } 1004 1005 /* Otherwise, reset the local format to the minimum. 
*/ 1006 switch (mode & S_IFMT) { 1007 case S_IFLNK: 1008 xrep_dinode_zap_symlink(ri, dip); 1009 break; 1010 case S_IFDIR: 1011 xrep_dinode_zap_dir(ri, dip); 1012 break; 1013 } 1014 } 1015 1016 /* 1017 * Check the attr fork for things that will fail the ifork verifiers or the 1018 * ifork formatters. 1019 */ 1020 STATIC bool 1021 xrep_dinode_check_afork( 1022 struct xfs_scrub *sc, 1023 struct xfs_dinode *dip) 1024 { 1025 struct xfs_attr_sf_hdr *afork_ptr; 1026 size_t attr_size; 1027 unsigned int afork_size; 1028 1029 if (XFS_DFORK_BOFF(dip) == 0) 1030 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || 1031 xfs_dfork_attr_extents(dip) != 0; 1032 1033 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1034 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1035 1036 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { 1037 case XFS_DINODE_FMT_LOCAL: 1038 /* Fork has to be large enough to extract the xattr size. */ 1039 if (afork_size < sizeof(struct xfs_attr_sf_hdr)) 1040 return true; 1041 1042 /* xattr structure cannot be larger than the fork */ 1043 attr_size = be16_to_cpu(afork_ptr->totsize); 1044 if (attr_size > afork_size) 1045 return true; 1046 1047 /* xattr structure must pass verification. */ 1048 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; 1049 case XFS_DINODE_FMT_EXTENTS: 1050 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, 1051 XFS_ATTR_FORK)) 1052 return true; 1053 break; 1054 case XFS_DINODE_FMT_BTREE: 1055 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, 1056 XFS_ATTR_FORK)) 1057 return true; 1058 break; 1059 default: 1060 return true; 1061 } 1062 1063 return false; 1064 } 1065 1066 /* 1067 * Reset the attr fork to empty. Since the attr fork could have contained 1068 * ACLs, make the file readable only by root. 
1069 */ 1070 STATIC void 1071 xrep_dinode_zap_afork( 1072 struct xrep_inode *ri, 1073 struct xfs_dinode *dip, 1074 uint16_t mode) 1075 { 1076 struct xfs_scrub *sc = ri->sc; 1077 1078 trace_xrep_dinode_zap_afork(sc, dip); 1079 1080 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; 1081 1082 dip->di_aformat = XFS_DINODE_FMT_EXTENTS; 1083 xrep_dinode_set_attr_nextents(dip, 0); 1084 ri->attr_blocks = 0; 1085 1086 /* 1087 * If the data fork is in btree format, removing the attr fork entirely 1088 * might cause verifier failures if the next level down in the bmbt 1089 * could now fit in the data fork area. 1090 */ 1091 if (dip->di_format != XFS_DINODE_FMT_BTREE) 1092 dip->di_forkoff = 0; 1093 dip->di_mode = cpu_to_be16(mode & ~0777); 1094 dip->di_uid = 0; 1095 dip->di_gid = 0; 1096 } 1097 1098 /* Make sure the fork offset is a sensible value. */ 1099 STATIC void 1100 xrep_dinode_ensure_forkoff( 1101 struct xrep_inode *ri, 1102 struct xfs_dinode *dip, 1103 uint16_t mode) 1104 { 1105 struct xfs_bmdr_block *bmdr; 1106 struct xfs_scrub *sc = ri->sc; 1107 xfs_extnum_t attr_extents, data_extents; 1108 size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); 1109 unsigned int lit_sz = XFS_LITINO(sc->mp); 1110 unsigned int afork_min, dfork_min; 1111 1112 trace_xrep_dinode_ensure_forkoff(sc, dip); 1113 1114 /* 1115 * Before calling this function, xrep_dinode_core ensured that both 1116 * forks actually fit inside their respective literal areas. If this 1117 * was not the case, the fork was reset to FMT_EXTENTS with zero 1118 * records. If the rmapbt scan found attr or data fork blocks, this 1119 * will be noted in the dinode_stats, and we must leave enough room 1120 * for the bmap repair code to reconstruct the mapping structure. 1121 * 1122 * First, compute the minimum space required for the attr fork. 
1123 */ 1124 switch (dip->di_aformat) { 1125 case XFS_DINODE_FMT_LOCAL: 1126 /* 1127 * If we still have a shortform xattr structure at all, that 1128 * means the attr fork area was exactly large enough to fit 1129 * the sf structure. 1130 */ 1131 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1132 break; 1133 case XFS_DINODE_FMT_EXTENTS: 1134 attr_extents = xfs_dfork_attr_extents(dip); 1135 if (attr_extents) { 1136 /* 1137 * We must maintain sufficient space to hold the entire 1138 * extent map array in the data fork. Note that we 1139 * previously zapped the fork if it had no chance of 1140 * fitting in the inode. 1141 */ 1142 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; 1143 } else if (ri->attr_extents > 0) { 1144 /* 1145 * The attr fork thinks it has zero extents, but we 1146 * found some xattr extents. We need to leave enough 1147 * empty space here so that the incore attr fork will 1148 * get created (and hence trigger the attr fork bmap 1149 * repairer). 1150 */ 1151 afork_min = bmdr_minsz; 1152 } else { 1153 /* No extents on disk or found in rmapbt. */ 1154 afork_min = 0; 1155 } 1156 break; 1157 case XFS_DINODE_FMT_BTREE: 1158 /* Must have space for btree header and key/pointers. */ 1159 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1160 afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1161 break; 1162 default: 1163 /* We should never see any other formats. */ 1164 afork_min = 0; 1165 break; 1166 } 1167 1168 /* Compute the minimum space required for the data fork. */ 1169 switch (dip->di_format) { 1170 case XFS_DINODE_FMT_DEV: 1171 dfork_min = sizeof(__be32); 1172 break; 1173 case XFS_DINODE_FMT_UUID: 1174 dfork_min = sizeof(uuid_t); 1175 break; 1176 case XFS_DINODE_FMT_LOCAL: 1177 /* 1178 * If we still have a shortform data fork at all, that means 1179 * the data fork area was large enough to fit whatever was in 1180 * there. 
1181 */ 1182 dfork_min = be64_to_cpu(dip->di_size); 1183 break; 1184 case XFS_DINODE_FMT_EXTENTS: 1185 data_extents = xfs_dfork_data_extents(dip); 1186 if (data_extents) { 1187 /* 1188 * We must maintain sufficient space to hold the entire 1189 * extent map array in the data fork. Note that we 1190 * previously zapped the fork if it had no chance of 1191 * fitting in the inode. 1192 */ 1193 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; 1194 } else if (ri->data_extents > 0 || ri->rt_extents > 0) { 1195 /* 1196 * The data fork thinks it has zero extents, but we 1197 * found some data extents. We need to leave enough 1198 * empty space here so that the data fork bmap repair 1199 * will recover the mappings. 1200 */ 1201 dfork_min = bmdr_minsz; 1202 } else { 1203 /* No extents on disk or found in rmapbt. */ 1204 dfork_min = 0; 1205 } 1206 break; 1207 case XFS_DINODE_FMT_BTREE: 1208 /* Must have space for btree header and key/pointers. */ 1209 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1210 dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1211 break; 1212 default: 1213 dfork_min = 0; 1214 break; 1215 } 1216 1217 /* 1218 * Round all values up to the nearest 8 bytes, because that is the 1219 * precision of di_forkoff. 1220 */ 1221 afork_min = roundup(afork_min, 8); 1222 dfork_min = roundup(dfork_min, 8); 1223 bmdr_minsz = roundup(bmdr_minsz, 8); 1224 1225 ASSERT(dfork_min <= lit_sz); 1226 ASSERT(afork_min <= lit_sz); 1227 1228 /* 1229 * If the data fork was zapped and we don't have enough space for the 1230 * recovery fork, move the attr fork up. 1231 */ 1232 if (dip->di_format == XFS_DINODE_FMT_EXTENTS && 1233 xfs_dfork_data_extents(dip) == 0 && 1234 (ri->data_extents > 0 || ri->rt_extents > 0) && 1235 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { 1236 if (bmdr_minsz + afork_min > lit_sz) { 1237 /* 1238 * The attr for and the stub fork we need to recover 1239 * the data fork won't both fit. Zap the attr fork. 
1240 */ 1241 xrep_dinode_zap_afork(ri, dip, mode); 1242 afork_min = bmdr_minsz; 1243 } else { 1244 void *before, *after; 1245 1246 /* Otherwise, just slide the attr fork up. */ 1247 before = XFS_DFORK_APTR(dip); 1248 dip->di_forkoff = bmdr_minsz >> 3; 1249 after = XFS_DFORK_APTR(dip); 1250 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); 1251 } 1252 } 1253 1254 /* 1255 * If the attr fork was zapped and we don't have enough space for the 1256 * recovery fork, move the attr fork down. 1257 */ 1258 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && 1259 xfs_dfork_attr_extents(dip) == 0 && 1260 ri->attr_extents > 0 && 1261 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { 1262 if (dip->di_format == XFS_DINODE_FMT_BTREE) { 1263 /* 1264 * If the data fork is in btree format then we can't 1265 * adjust forkoff because that runs the risk of 1266 * violating the extents/btree format transition rules. 1267 */ 1268 } else if (bmdr_minsz + dfork_min > lit_sz) { 1269 /* 1270 * If we can't move the attr fork, too bad, we lose the 1271 * attr fork and leak its blocks. 1272 */ 1273 xrep_dinode_zap_afork(ri, dip, mode); 1274 } else { 1275 /* 1276 * Otherwise, just slide the attr fork down. The attr 1277 * fork is empty, so we don't have any old contents to 1278 * move here. 1279 */ 1280 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; 1281 } 1282 } 1283 } 1284 1285 /* 1286 * Zap the data/attr forks if we spot anything that isn't going to pass the 1287 * ifork verifiers or the ifork formatters, because we need to get the inode 1288 * into good enough shape that the higher level repair functions can run. 
1289 */ 1290 STATIC void 1291 xrep_dinode_zap_forks( 1292 struct xrep_inode *ri, 1293 struct xfs_dinode *dip) 1294 { 1295 struct xfs_scrub *sc = ri->sc; 1296 xfs_extnum_t data_extents; 1297 xfs_extnum_t attr_extents; 1298 xfs_filblks_t nblocks; 1299 uint16_t mode; 1300 bool zap_datafork = false; 1301 bool zap_attrfork = ri->zap_acls; 1302 1303 trace_xrep_dinode_zap_forks(sc, dip); 1304 1305 mode = be16_to_cpu(dip->di_mode); 1306 1307 data_extents = xfs_dfork_data_extents(dip); 1308 attr_extents = xfs_dfork_attr_extents(dip); 1309 nblocks = be64_to_cpu(dip->di_nblocks); 1310 1311 /* Inode counters don't make sense? */ 1312 if (data_extents > nblocks) 1313 zap_datafork = true; 1314 if (attr_extents > nblocks) 1315 zap_attrfork = true; 1316 if (data_extents + attr_extents > nblocks) 1317 zap_datafork = zap_attrfork = true; 1318 1319 if (!zap_datafork) 1320 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); 1321 if (!zap_attrfork) 1322 zap_attrfork = xrep_dinode_check_afork(sc, dip); 1323 1324 /* Zap whatever's bad. */ 1325 if (zap_attrfork) 1326 xrep_dinode_zap_afork(ri, dip, mode); 1327 if (zap_datafork) 1328 xrep_dinode_zap_dfork(ri, dip, mode); 1329 xrep_dinode_ensure_forkoff(ri, dip, mode); 1330 1331 /* 1332 * Zero di_nblocks if we don't have any extents at all to satisfy the 1333 * buffer verifier. 1334 */ 1335 data_extents = xfs_dfork_data_extents(dip); 1336 attr_extents = xfs_dfork_attr_extents(dip); 1337 if (data_extents + attr_extents == 0) 1338 dip->di_nblocks = 0; 1339 } 1340 1341 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ 1342 STATIC int 1343 xrep_dinode_core( 1344 struct xrep_inode *ri) 1345 { 1346 struct xfs_scrub *sc = ri->sc; 1347 struct xfs_buf *bp; 1348 struct xfs_dinode *dip; 1349 xfs_ino_t ino = sc->sm->sm_ino; 1350 int error; 1351 int iget_error; 1352 1353 /* Figure out what this inode had mapped in both forks. 
*/ 1354 error = xrep_dinode_count_rmaps(ri); 1355 if (error) 1356 return error; 1357 1358 /* Read the inode cluster buffer. */ 1359 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1360 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1361 NULL); 1362 if (error) 1363 return error; 1364 1365 /* Make sure we can pass the inode buffer verifier. */ 1366 xrep_dinode_buf(sc, bp); 1367 bp->b_ops = &xfs_inode_buf_ops; 1368 1369 /* Fix everything the verifier will complain about. */ 1370 dip = xfs_buf_offset(bp, ri->imap.im_boffset); 1371 xrep_dinode_header(sc, dip); 1372 iget_error = xrep_dinode_mode(ri, dip); 1373 if (iget_error) 1374 goto write; 1375 xrep_dinode_flags(sc, dip, ri->rt_extents > 0); 1376 xrep_dinode_size(ri, dip); 1377 xrep_dinode_extsize_hints(sc, dip); 1378 xrep_dinode_zap_forks(ri, dip); 1379 1380 write: 1381 /* Write out the inode. */ 1382 trace_xrep_dinode_fixed(sc, dip); 1383 xfs_dinode_calc_crc(sc->mp, dip); 1384 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); 1385 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, 1386 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); 1387 1388 /* 1389 * In theory, we've fixed the ondisk inode record enough that we should 1390 * be able to load the inode into the cache. Try to iget that inode 1391 * now while we hold the AGI and the inode cluster buffer and take the 1392 * IOLOCK so that we can continue with repairs without anyone else 1393 * accessing the inode. If iget fails, we still need to commit the 1394 * changes. 1395 */ 1396 if (!iget_error) 1397 iget_error = xchk_iget(sc, ino, &sc->ip); 1398 if (!iget_error) 1399 xchk_ilock(sc, XFS_IOLOCK_EXCL); 1400 1401 /* 1402 * Commit the inode cluster buffer updates and drop the AGI buffer that 1403 * we've been holding since scrub setup. From here on out, repairs 1404 * deal only with the cached inode. 
1405 */ 1406 error = xrep_trans_commit(sc); 1407 if (error) 1408 return error; 1409 1410 if (iget_error) 1411 return iget_error; 1412 1413 error = xchk_trans_alloc(sc, 0); 1414 if (error) 1415 return error; 1416 1417 error = xrep_ino_dqattach(sc); 1418 if (error) 1419 return error; 1420 1421 xchk_ilock(sc, XFS_ILOCK_EXCL); 1422 if (ri->ino_sick_mask) 1423 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); 1424 return 0; 1425 } 1426 1427 /* Fix everything xfs_dinode_verify cares about. */ 1428 STATIC int 1429 xrep_dinode_problems( 1430 struct xrep_inode *ri) 1431 { 1432 struct xfs_scrub *sc = ri->sc; 1433 int error; 1434 1435 error = xrep_dinode_core(ri); 1436 if (error) 1437 return error; 1438 1439 /* We had to fix a totally busted inode, schedule quotacheck. */ 1440 if (XFS_IS_UQUOTA_ON(sc->mp)) 1441 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1442 if (XFS_IS_GQUOTA_ON(sc->mp)) 1443 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1444 if (XFS_IS_PQUOTA_ON(sc->mp)) 1445 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1446 1447 return 0; 1448 } 1449 1450 /* 1451 * Fix problems that the verifiers don't care about. In general these are 1452 * errors that don't cause problems elsewhere in the kernel that we can easily 1453 * detect, so we don't check them all that rigorously. 1454 */ 1455 1456 /* Make sure block and extent counts are ok. */ 1457 STATIC int 1458 xrep_inode_blockcounts( 1459 struct xfs_scrub *sc) 1460 { 1461 struct xfs_ifork *ifp; 1462 xfs_filblks_t count; 1463 xfs_filblks_t acount; 1464 xfs_extnum_t nextents; 1465 int error; 1466 1467 trace_xrep_inode_blockcounts(sc); 1468 1469 /* Set data fork counters from the data fork mappings. */ 1470 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1471 &nextents, &count); 1472 if (error) 1473 return error; 1474 if (xfs_is_reflink_inode(sc->ip)) { 1475 /* 1476 * data fork blockcount can exceed physical storage if a user 1477 * reflinks the same block over and over again. 
1478 */ 1479 ; 1480 } else if (XFS_IS_REALTIME_INODE(sc->ip)) { 1481 if (count >= sc->mp->m_sb.sb_rblocks) 1482 return -EFSCORRUPTED; 1483 } else { 1484 if (count >= sc->mp->m_sb.sb_dblocks) 1485 return -EFSCORRUPTED; 1486 } 1487 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); 1488 if (error) 1489 return error; 1490 sc->ip->i_df.if_nextents = nextents; 1491 1492 /* Set attr fork counters from the attr fork mappings. */ 1493 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1494 if (ifp) { 1495 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1496 &nextents, &acount); 1497 if (error) 1498 return error; 1499 if (count >= sc->mp->m_sb.sb_dblocks) 1500 return -EFSCORRUPTED; 1501 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, 1502 nextents); 1503 if (error) 1504 return error; 1505 ifp->if_nextents = nextents; 1506 } else { 1507 acount = 0; 1508 } 1509 1510 sc->ip->i_nblocks = count + acount; 1511 return 0; 1512 } 1513 1514 /* Check for invalid uid/gid/prid. */ 1515 STATIC void 1516 xrep_inode_ids( 1517 struct xfs_scrub *sc) 1518 { 1519 bool dirty = false; 1520 1521 trace_xrep_inode_ids(sc); 1522 1523 if (!uid_valid(VFS_I(sc->ip)->i_uid)) { 1524 i_uid_write(VFS_I(sc->ip), 0); 1525 dirty = true; 1526 if (XFS_IS_UQUOTA_ON(sc->mp)) 1527 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1528 } 1529 1530 if (!gid_valid(VFS_I(sc->ip)->i_gid)) { 1531 i_gid_write(VFS_I(sc->ip), 0); 1532 dirty = true; 1533 if (XFS_IS_GQUOTA_ON(sc->mp)) 1534 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1535 } 1536 1537 if (sc->ip->i_projid == -1U) { 1538 sc->ip->i_projid = 0; 1539 dirty = true; 1540 if (XFS_IS_PQUOTA_ON(sc->mp)) 1541 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1542 } 1543 1544 /* strip setuid/setgid if we touched any of the ids */ 1545 if (dirty) 1546 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); 1547 } 1548 1549 static inline void 1550 xrep_clamp_timestamp( 1551 struct xfs_inode *ip, 1552 struct timespec64 *ts) 1553 { 1554 ts->tv_nsec = clamp_t(long, 
ts->tv_nsec, 0, NSEC_PER_SEC); 1555 *ts = timestamp_truncate(*ts, VFS_I(ip)); 1556 } 1557 1558 /* Nanosecond counters can't have more than 1 billion. */ 1559 STATIC void 1560 xrep_inode_timestamps( 1561 struct xfs_inode *ip) 1562 { 1563 struct timespec64 tstamp; 1564 struct inode *inode = VFS_I(ip); 1565 1566 tstamp = inode_get_atime(inode); 1567 xrep_clamp_timestamp(ip, &tstamp); 1568 inode_set_atime_to_ts(inode, tstamp); 1569 1570 tstamp = inode_get_mtime(inode); 1571 xrep_clamp_timestamp(ip, &tstamp); 1572 inode_set_mtime_to_ts(inode, tstamp); 1573 1574 tstamp = inode_get_ctime(inode); 1575 xrep_clamp_timestamp(ip, &tstamp); 1576 inode_set_ctime_to_ts(inode, tstamp); 1577 1578 xrep_clamp_timestamp(ip, &ip->i_crtime); 1579 } 1580 1581 /* Fix inode flags that don't make sense together. */ 1582 STATIC void 1583 xrep_inode_flags( 1584 struct xfs_scrub *sc) 1585 { 1586 uint16_t mode; 1587 1588 trace_xrep_inode_flags(sc); 1589 1590 mode = VFS_I(sc->ip)->i_mode; 1591 1592 /* Clear junk flags */ 1593 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) 1594 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; 1595 1596 /* NEWRTBM only applies to realtime bitmaps */ 1597 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) 1598 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; 1599 else 1600 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; 1601 1602 /* These only make sense for directories. */ 1603 if (!S_ISDIR(mode)) 1604 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | 1605 XFS_DIFLAG_EXTSZINHERIT | 1606 XFS_DIFLAG_PROJINHERIT | 1607 XFS_DIFLAG_NOSYMLINKS); 1608 1609 /* These only make sense for files. */ 1610 if (!S_ISREG(mode)) 1611 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | 1612 XFS_DIFLAG_EXTSIZE); 1613 1614 /* These only make sense for non-rt files. */ 1615 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1616 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; 1617 1618 /* Immutable and append only? Drop the append. 
*/ 1619 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && 1620 (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) 1621 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; 1622 1623 /* Clear junk flags. */ 1624 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) 1625 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; 1626 1627 /* No reflink flag unless we support it and it's a file. */ 1628 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) 1629 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1630 1631 /* DAX only applies to files and dirs. */ 1632 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1633 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1634 1635 /* No reflink files on the realtime device. */ 1636 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1637 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1638 } 1639 1640 /* 1641 * Fix size problems with block/node format directories. If we fail to find 1642 * the extent list, just bail out and let the bmapbtd repair functions clean 1643 * up that mess. 1644 */ 1645 STATIC void 1646 xrep_inode_blockdir_size( 1647 struct xfs_scrub *sc) 1648 { 1649 struct xfs_iext_cursor icur; 1650 struct xfs_bmbt_irec got; 1651 struct xfs_ifork *ifp; 1652 xfs_fileoff_t off; 1653 int error; 1654 1655 trace_xrep_inode_blockdir_size(sc); 1656 1657 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); 1658 if (error) 1659 return; 1660 1661 /* Find the last block before 32G; this is the dir size. */ 1662 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1663 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); 1664 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { 1665 /* zero-extents directory? */ 1666 return; 1667 } 1668 1669 off = got.br_startoff + got.br_blockcount; 1670 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, 1671 XFS_FSB_TO_B(sc->mp, off)); 1672 } 1673 1674 /* Fix size problems with short format directories. 
*/ 1675 STATIC void 1676 xrep_inode_sfdir_size( 1677 struct xfs_scrub *sc) 1678 { 1679 struct xfs_ifork *ifp; 1680 1681 trace_xrep_inode_sfdir_size(sc); 1682 1683 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1684 sc->ip->i_disk_size = ifp->if_bytes; 1685 } 1686 1687 /* 1688 * Fix any irregularities in a directory inode's size now that we can iterate 1689 * extent maps and access other regular inode data. 1690 */ 1691 STATIC void 1692 xrep_inode_dir_size( 1693 struct xfs_scrub *sc) 1694 { 1695 trace_xrep_inode_dir_size(sc); 1696 1697 switch (sc->ip->i_df.if_format) { 1698 case XFS_DINODE_FMT_EXTENTS: 1699 case XFS_DINODE_FMT_BTREE: 1700 xrep_inode_blockdir_size(sc); 1701 break; 1702 case XFS_DINODE_FMT_LOCAL: 1703 xrep_inode_sfdir_size(sc); 1704 break; 1705 } 1706 } 1707 1708 /* Fix extent size hint problems. */ 1709 STATIC void 1710 xrep_inode_extsize( 1711 struct xfs_scrub *sc) 1712 { 1713 /* Fix misaligned extent size hints on a directory. */ 1714 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1715 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && 1716 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { 1717 sc->ip->i_extsize = 0; 1718 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; 1719 } 1720 } 1721 1722 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1723 STATIC int 1724 xrep_inode_problems( 1725 struct xfs_scrub *sc) 1726 { 1727 int error; 1728 1729 error = xrep_inode_blockcounts(sc); 1730 if (error) 1731 return error; 1732 xrep_inode_timestamps(sc->ip); 1733 xrep_inode_flags(sc); 1734 xrep_inode_ids(sc); 1735 /* 1736 * We can now do a better job fixing the size of a directory now that 1737 * we can scan the data fork extents than we could in xrep_dinode_size. 
1738 */ 1739 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1740 xrep_inode_dir_size(sc); 1741 xrep_inode_extsize(sc); 1742 1743 trace_xrep_inode_fixed(sc); 1744 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1745 return xrep_roll_trans(sc); 1746 } 1747 1748 /* Repair an inode's fields. */ 1749 int 1750 xrep_inode( 1751 struct xfs_scrub *sc) 1752 { 1753 int error = 0; 1754 1755 /* 1756 * No inode? That means we failed the _iget verifiers. Repair all 1757 * the things that the inode verifiers care about, then retry _iget. 1758 */ 1759 if (!sc->ip) { 1760 struct xrep_inode *ri = sc->buf; 1761 1762 ASSERT(ri != NULL); 1763 1764 error = xrep_dinode_problems(ri); 1765 if (error == -EBUSY) { 1766 /* 1767 * Directory scan to recover inode mode encountered a 1768 * busy inode, so we did not continue repairing things. 1769 */ 1770 return 0; 1771 } 1772 if (error) 1773 return error; 1774 1775 /* By this point we had better have a working incore inode. */ 1776 if (!sc->ip) 1777 return -EFSCORRUPTED; 1778 } 1779 1780 xfs_trans_ijoin(sc->tp, sc->ip, 0); 1781 1782 /* If we found corruption of any kind, try to fix it. */ 1783 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || 1784 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { 1785 error = xrep_inode_problems(sc); 1786 if (error) 1787 return error; 1788 } 1789 1790 /* See if we can clear the reflink flag. */ 1791 if (xfs_is_reflink_inode(sc->ip)) { 1792 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); 1793 if (error) 1794 return error; 1795 } 1796 1797 return xrep_defer_finish(sc); 1798 } 1799