1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_icache.h" 20 #include "xfs_inode_buf.h" 21 #include "xfs_inode_fork.h" 22 #include "xfs_ialloc.h" 23 #include "xfs_da_format.h" 24 #include "xfs_reflink.h" 25 #include "xfs_alloc.h" 26 #include "xfs_rmap.h" 27 #include "xfs_rmap_btree.h" 28 #include "xfs_bmap.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_util.h" 31 #include "xfs_dir2.h" 32 #include "xfs_dir2_priv.h" 33 #include "xfs_quota_defs.h" 34 #include "xfs_quota.h" 35 #include "xfs_ag.h" 36 #include "xfs_rtbitmap.h" 37 #include "xfs_attr_leaf.h" 38 #include "xfs_log_priv.h" 39 #include "xfs_health.h" 40 #include "scrub/xfs_scrub.h" 41 #include "scrub/scrub.h" 42 #include "scrub/common.h" 43 #include "scrub/btree.h" 44 #include "scrub/trace.h" 45 #include "scrub/repair.h" 46 #include "scrub/iscan.h" 47 #include "scrub/readdir.h" 48 49 /* 50 * Inode Record Repair 51 * =================== 52 * 53 * Roughly speaking, inode problems can be classified based on whether or not 54 * they trip the dinode verifiers. If those trip, then we won't be able to 55 * xfs_iget ourselves the inode. 56 * 57 * Therefore, the xrep_dinode_* functions fix anything that will cause the 58 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions 59 * fix things on live incore inodes. The inode repair functions make decisions 60 * with security and usability implications when reviving a file: 61 * 62 * - Files with zero di_mode or a garbage di_mode are converted to regular file 63 * that only root can read. This file may not actually contain user data, 64 * if the file was not previously a regular file. Setuid and setgid bits 65 * are cleared. 66 * 67 * - Zero-size directories can be truncated to look empty. It is necessary to 68 * run the bmapbtd and directory repair functions to fully rebuild the 69 * directory. 70 * 71 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary 72 * to run the bmapbtd and symlink repair functions to salvage the symlink. 73 * 74 * - Invalid extent size hints will be removed. 75 * 76 * - Quotacheck will be scheduled if we repaired an inode that was so badly 77 * damaged that the ondisk inode had to be rebuilt. 78 * 79 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. 80 * Setuid and setgid bits are cleared. 81 * 82 * - Data and attr forks are reset to extents format with zero extents if the 83 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta 84 * repair functions to recover the space mapping. 85 * 86 * - ACLs will not be recovered if the attr fork is zapped or the extended 87 * attribute structure itself requires salvaging. 88 * 89 * - If the attr fork is zapped, the user and group ids are reset to root and 90 * the setuid and setgid bits are removed. 91 */ 92 93 /* 94 * All the information we need to repair the ondisk inode if we can't iget the 95 * incore inode. We don't allocate this buffer unless we're going to perform 96 * a repair to the ondisk inode cluster buffer. 97 */ 98 struct xrep_inode { 99 /* Inode mapping that we saved from the initial lookup attempt. */ 100 struct xfs_imap imap; 101 102 struct xfs_scrub *sc; 103 104 /* Blocks in use on the data device by data extents or bmbt blocks. */ 105 xfs_rfsblock_t data_blocks; 106 107 /* Blocks in use on the rt device. */ 108 xfs_rfsblock_t rt_blocks; 109 110 /* Blocks in use by the attr fork. */ 111 xfs_rfsblock_t attr_blocks; 112 113 /* Number of data device extents for the data fork. */ 114 xfs_extnum_t data_extents; 115 116 /* 117 * Number of realtime device extents for the data fork. If 118 * data_extents and rt_extents indicate that the data fork has extents 119 * on both devices, we'll just back away slowly. 120 */ 121 xfs_extnum_t rt_extents; 122 123 /* Number of (data device) extents for the attr fork. */ 124 xfs_aextnum_t attr_extents; 125 126 /* Sick state to set after zapping parts of the inode. */ 127 unsigned int ino_sick_mask; 128 129 /* Must we remove all access from this file? */ 130 bool zap_acls; 131 132 /* Inode scanner to see if we can find the ftype from dirents */ 133 struct xchk_iscan ftype_iscan; 134 uint8_t alleged_ftype; 135 }; 136 137 /* 138 * Setup function for inode repair. @imap contains the ondisk inode mapping 139 * information so that we can correct the ondisk inode cluster buffer if 140 * necessary to make iget work. 141 */ 142 int 143 xrep_setup_inode( 144 struct xfs_scrub *sc, 145 const struct xfs_imap *imap) 146 { 147 struct xrep_inode *ri; 148 149 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); 150 if (!sc->buf) 151 return -ENOMEM; 152 153 ri = sc->buf; 154 memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); 155 ri->sc = sc; 156 return 0; 157 } 158 159 /* 160 * Make sure this ondisk inode can pass the inode buffer verifier. This is 161 * not the same as the dinode verifier. 162 */ 163 STATIC void 164 xrep_dinode_buf_core( 165 struct xfs_scrub *sc, 166 struct xfs_buf *bp, 167 unsigned int ioffset) 168 { 169 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); 170 struct xfs_trans *tp = sc->tp; 171 struct xfs_mount *mp = sc->mp; 172 xfs_agino_t agino; 173 bool crc_ok = false; 174 bool magic_ok = false; 175 bool unlinked_ok = false; 176 177 agino = be32_to_cpu(dip->di_next_unlinked); 178 179 if (xfs_verify_agino_or_null(bp->b_pag, agino)) 180 unlinked_ok = true; 181 182 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 183 xfs_dinode_good_version(mp, dip->di_version)) 184 magic_ok = true; 185 186 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 187 XFS_DINODE_CRC_OFF)) 188 crc_ok = true; 189 190 if (magic_ok && unlinked_ok && crc_ok) 191 return; 192 193 if (!magic_ok) { 194 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 195 dip->di_version = 3; 196 } 197 if (!unlinked_ok) 198 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 199 xfs_dinode_calc_crc(mp, dip); 200 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 201 xfs_trans_log_buf(tp, bp, ioffset, 202 ioffset + sizeof(struct xfs_dinode) - 1); 203 } 204 205 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ 206 STATIC void 207 xrep_dinode_buf( 208 struct xfs_scrub *sc, 209 struct xfs_buf *bp) 210 { 211 struct xfs_mount *mp = sc->mp; 212 int i; 213 int ni; 214 215 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 216 for (i = 0; i < ni; i++) 217 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); 218 } 219 220 /* Reinitialize things that never change in an inode. */ 221 STATIC void 222 xrep_dinode_header( 223 struct xfs_scrub *sc, 224 struct xfs_dinode *dip) 225 { 226 trace_xrep_dinode_header(sc, dip); 227 228 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 229 if (!xfs_dinode_good_version(sc->mp, dip->di_version)) 230 dip->di_version = 3; 231 dip->di_ino = cpu_to_be64(sc->sm->sm_ino); 232 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); 233 dip->di_gen = cpu_to_be32(sc->sm->sm_gen); 234 } 235 236 /* 237 * If this directory entry points to the scrub target inode, then the directory 238 * we're scanning is the parent of the scrub target inode. 239 */ 240 STATIC int 241 xrep_dinode_findmode_dirent( 242 struct xfs_scrub *sc, 243 struct xfs_inode *dp, 244 xfs_dir2_dataptr_t dapos, 245 const struct xfs_name *name, 246 xfs_ino_t ino, 247 void *priv) 248 { 249 struct xrep_inode *ri = priv; 250 int error = 0; 251 252 if (xchk_should_terminate(ri->sc, &error)) 253 return error; 254 255 if (ino != sc->sm->sm_ino) 256 return 0; 257 258 /* Ignore garbage directory entry names. */ 259 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) 260 return -EFSCORRUPTED; 261 262 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 263 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 264 xfs_dir2_samename(name, &xfs_name_dot)) 265 return 0; 266 267 /* 268 * Uhoh, more than one parent for this inode and they don't agree on 269 * the file type? 270 */ 271 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && 272 ri->alleged_ftype != name->type) { 273 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, 274 ri->alleged_ftype); 275 return -EFSCORRUPTED; 276 } 277 278 /* We found a potential parent; remember the ftype. */ 279 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); 280 ri->alleged_ftype = name->type; 281 return 0; 282 } 283 284 /* 285 * If this is a directory, walk the dirents looking for any that point to the 286 * scrub target inode. 287 */ 288 STATIC int 289 xrep_dinode_findmode_walk_directory( 290 struct xrep_inode *ri, 291 struct xfs_inode *dp) 292 { 293 struct xfs_scrub *sc = ri->sc; 294 unsigned int lock_mode; 295 int error = 0; 296 297 /* 298 * Scan the directory to see if there it contains an entry pointing to 299 * the directory that we are repairing. 300 */ 301 lock_mode = xfs_ilock_data_map_shared(dp); 302 303 /* 304 * If this directory is known to be sick, we cannot scan it reliably 305 * and must abort. 306 */ 307 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | 308 XFS_SICK_INO_BMBTD | 309 XFS_SICK_INO_DIR)) { 310 error = -EFSCORRUPTED; 311 goto out_unlock; 312 } 313 314 /* 315 * We cannot complete our parent pointer scan if a directory looks as 316 * though it has been zapped by the inode record repair code. 317 */ 318 if (xchk_dir_looks_zapped(dp)) { 319 error = -EBUSY; 320 goto out_unlock; 321 } 322 323 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); 324 if (error) 325 goto out_unlock; 326 327 out_unlock: 328 xfs_iunlock(dp, lock_mode); 329 return error; 330 } 331 332 /* 333 * Try to find the mode of the inode being repaired by looking for directories 334 * that point down to this file. 335 */ 336 STATIC int 337 xrep_dinode_find_mode( 338 struct xrep_inode *ri, 339 uint16_t *mode) 340 { 341 struct xfs_scrub *sc = ri->sc; 342 struct xfs_inode *dp; 343 int error; 344 345 /* No ftype means we have no other metadata to consult. */ 346 if (!xfs_has_ftype(sc->mp)) { 347 *mode = S_IFREG; 348 return 0; 349 } 350 351 /* 352 * Scan all directories for parents that might point down to this 353 * inode. Skip the inode being repaired during the scan since it 354 * cannot be its own parent. Note that we still hold the AGI locked 355 * so there's a real possibility that _iscan_iter can return EBUSY. 356 */ 357 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); 358 ri->ftype_iscan.skip_ino = sc->sm->sm_ino; 359 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; 360 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { 361 if (S_ISDIR(VFS_I(dp)->i_mode)) 362 error = xrep_dinode_findmode_walk_directory(ri, dp); 363 xchk_iscan_mark_visited(&ri->ftype_iscan, dp); 364 xchk_irele(sc, dp); 365 if (error < 0) 366 break; 367 if (xchk_should_terminate(sc, &error)) 368 break; 369 } 370 xchk_iscan_iter_finish(&ri->ftype_iscan); 371 xchk_iscan_teardown(&ri->ftype_iscan); 372 373 if (error == -EBUSY) { 374 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { 375 /* 376 * If we got an EBUSY after finding at least one 377 * dirent, that means the scan found an inode on the 378 * inactivation list and could not open it. Accept the 379 * alleged ftype and install a new mode below. 380 */ 381 error = 0; 382 } else if (!(sc->flags & XCHK_TRY_HARDER)) { 383 /* 384 * Otherwise, retry the operation one time to see if 385 * the reason for the delay is an inode from the same 386 * cluster buffer waiting on the inactivation list. 387 */ 388 error = -EDEADLOCK; 389 } 390 } 391 if (error) 392 return error; 393 394 /* 395 * Convert the discovered ftype into the file mode. If all else fails, 396 * return S_IFREG. 397 */ 398 switch (ri->alleged_ftype) { 399 case XFS_DIR3_FT_DIR: 400 *mode = S_IFDIR; 401 break; 402 case XFS_DIR3_FT_WHT: 403 case XFS_DIR3_FT_CHRDEV: 404 *mode = S_IFCHR; 405 break; 406 case XFS_DIR3_FT_BLKDEV: 407 *mode = S_IFBLK; 408 break; 409 case XFS_DIR3_FT_FIFO: 410 *mode = S_IFIFO; 411 break; 412 case XFS_DIR3_FT_SOCK: 413 *mode = S_IFSOCK; 414 break; 415 case XFS_DIR3_FT_SYMLINK: 416 *mode = S_IFLNK; 417 break; 418 default: 419 *mode = S_IFREG; 420 break; 421 } 422 return 0; 423 } 424 425 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ 426 STATIC int 427 xrep_dinode_mode( 428 struct xrep_inode *ri, 429 struct xfs_dinode *dip) 430 { 431 struct xfs_scrub *sc = ri->sc; 432 uint16_t mode = be16_to_cpu(dip->di_mode); 433 int error; 434 435 trace_xrep_dinode_mode(sc, dip); 436 437 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) 438 return 0; 439 440 /* Try to fix the mode. If we cannot, then leave everything alone. */ 441 error = xrep_dinode_find_mode(ri, &mode); 442 switch (error) { 443 case -EINTR: 444 case -EBUSY: 445 case -EDEADLOCK: 446 /* temporary failure or fatal signal */ 447 return error; 448 case 0: 449 /* found mode */ 450 break; 451 default: 452 /* some other error, assume S_IFREG */ 453 mode = S_IFREG; 454 break; 455 } 456 457 /* bad mode, so we set it to a file that only root can read */ 458 dip->di_mode = cpu_to_be16(mode); 459 dip->di_uid = 0; 460 dip->di_gid = 0; 461 ri->zap_acls = true; 462 return 0; 463 } 464 465 /* Fix any conflicting flags that the verifiers complain about. */ 466 STATIC void 467 xrep_dinode_flags( 468 struct xfs_scrub *sc, 469 struct xfs_dinode *dip, 470 bool isrt) 471 { 472 struct xfs_mount *mp = sc->mp; 473 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 474 uint16_t flags = be16_to_cpu(dip->di_flags); 475 uint16_t mode = be16_to_cpu(dip->di_mode); 476 477 trace_xrep_dinode_flags(sc, dip); 478 479 if (isrt) 480 flags |= XFS_DIFLAG_REALTIME; 481 else 482 flags &= ~XFS_DIFLAG_REALTIME; 483 484 /* 485 * For regular files on a reflink filesystem, set the REFLINK flag to 486 * protect shared extents. A later stage will actually check those 487 * extents and clear the flag if possible. 488 */ 489 if (xfs_has_reflink(mp) && S_ISREG(mode)) 490 flags2 |= XFS_DIFLAG2_REFLINK; 491 else 492 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 493 if (flags & XFS_DIFLAG_REALTIME) 494 flags2 &= ~XFS_DIFLAG2_REFLINK; 495 if (!xfs_has_bigtime(mp)) 496 flags2 &= ~XFS_DIFLAG2_BIGTIME; 497 if (!xfs_has_large_extent_counts(mp)) 498 flags2 &= ~XFS_DIFLAG2_NREXT64; 499 if (flags2 & XFS_DIFLAG2_NREXT64) 500 dip->di_nrext64_pad = 0; 501 else if (dip->di_version >= 3) 502 dip->di_v3_pad = 0; 503 dip->di_flags = cpu_to_be16(flags); 504 dip->di_flags2 = cpu_to_be64(flags2); 505 } 506 507 /* 508 * Blow out symlink; now it points nowhere. We don't have to worry about 509 * incore state because this inode is failing the verifiers. 510 */ 511 STATIC void 512 xrep_dinode_zap_symlink( 513 struct xrep_inode *ri, 514 struct xfs_dinode *dip) 515 { 516 struct xfs_scrub *sc = ri->sc; 517 char *p; 518 519 trace_xrep_dinode_zap_symlink(sc, dip); 520 521 dip->di_format = XFS_DINODE_FMT_LOCAL; 522 dip->di_size = cpu_to_be64(1); 523 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 524 *p = '?'; 525 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; 526 } 527 528 /* 529 * Blow out dir, make the parent point to the root. In the future repair will 530 * reconstruct this directory for us. Note that there's no in-core directory 531 * inode because the sf verifier tripped, so we don't have to worry about the 532 * dentry cache. 533 */ 534 STATIC void 535 xrep_dinode_zap_dir( 536 struct xrep_inode *ri, 537 struct xfs_dinode *dip) 538 { 539 struct xfs_scrub *sc = ri->sc; 540 struct xfs_mount *mp = sc->mp; 541 struct xfs_dir2_sf_hdr *sfp; 542 int i8count; 543 544 trace_xrep_dinode_zap_dir(sc, dip); 545 546 dip->di_format = XFS_DINODE_FMT_LOCAL; 547 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; 548 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 549 sfp->count = 0; 550 sfp->i8count = i8count; 551 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); 552 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); 553 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; 554 } 555 556 /* Make sure we don't have a garbage file size. */ 557 STATIC void 558 xrep_dinode_size( 559 struct xrep_inode *ri, 560 struct xfs_dinode *dip) 561 { 562 struct xfs_scrub *sc = ri->sc; 563 uint64_t size = be64_to_cpu(dip->di_size); 564 uint16_t mode = be16_to_cpu(dip->di_mode); 565 566 trace_xrep_dinode_size(sc, dip); 567 568 switch (mode & S_IFMT) { 569 case S_IFIFO: 570 case S_IFCHR: 571 case S_IFBLK: 572 case S_IFSOCK: 573 /* di_size can't be nonzero for special files */ 574 dip->di_size = 0; 575 break; 576 case S_IFREG: 577 /* Regular files can't be larger than 2^63-1 bytes. */ 578 dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); 579 break; 580 case S_IFLNK: 581 /* 582 * Truncate ridiculously oversized symlinks. If the size is 583 * zero, reset it to point to the current directory. Both of 584 * these conditions trigger dinode verifier errors, so there 585 * is no in-core state to reset. 586 */ 587 if (size > XFS_SYMLINK_MAXLEN) 588 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); 589 else if (size == 0) 590 xrep_dinode_zap_symlink(ri, dip); 591 break; 592 case S_IFDIR: 593 /* 594 * Directories can't have a size larger than 32G. If the size 595 * is zero, reset it to an empty directory. Both of these 596 * conditions trigger dinode verifier errors, so there is no 597 * in-core state to reset. 598 */ 599 if (size > XFS_DIR2_SPACE_SIZE) 600 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); 601 else if (size == 0) 602 xrep_dinode_zap_dir(ri, dip); 603 break; 604 } 605 } 606 607 /* Fix extent size hints. */ 608 STATIC void 609 xrep_dinode_extsize_hints( 610 struct xfs_scrub *sc, 611 struct xfs_dinode *dip) 612 { 613 struct xfs_mount *mp = sc->mp; 614 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 615 uint16_t flags = be16_to_cpu(dip->di_flags); 616 uint16_t mode = be16_to_cpu(dip->di_mode); 617 618 xfs_failaddr_t fa; 619 620 trace_xrep_dinode_extsize_hints(sc, dip); 621 622 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 623 mode, flags); 624 if (fa) { 625 dip->di_extsize = 0; 626 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | 627 XFS_DIFLAG_EXTSZINHERIT); 628 } 629 630 if (dip->di_version < 3) 631 return; 632 633 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 634 mode, flags, flags2); 635 if (fa) { 636 dip->di_cowextsize = 0; 637 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); 638 } 639 } 640 641 /* Count extents and blocks for an inode given an rmap. */ 642 STATIC int 643 xrep_dinode_walk_rmap( 644 struct xfs_btree_cur *cur, 645 const struct xfs_rmap_irec *rec, 646 void *priv) 647 { 648 struct xrep_inode *ri = priv; 649 int error = 0; 650 651 if (xchk_should_terminate(ri->sc, &error)) 652 return error; 653 654 /* We only care about this inode. */ 655 if (rec->rm_owner != ri->sc->sm->sm_ino) 656 return 0; 657 658 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { 659 ri->attr_blocks += rec->rm_blockcount; 660 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 661 ri->attr_extents++; 662 663 return 0; 664 } 665 666 ri->data_blocks += rec->rm_blockcount; 667 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 668 ri->data_extents++; 669 670 return 0; 671 } 672 673 /* Count extents and blocks for an inode from all AG rmap data. */ 674 STATIC int 675 xrep_dinode_count_ag_rmaps( 676 struct xrep_inode *ri, 677 struct xfs_perag *pag) 678 { 679 struct xfs_btree_cur *cur; 680 struct xfs_buf *agf; 681 int error; 682 683 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); 684 if (error) 685 return error; 686 687 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); 688 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); 689 xfs_btree_del_cursor(cur, error); 690 xfs_trans_brelse(ri->sc->tp, agf); 691 return error; 692 } 693 694 /* Count extents and blocks for a given inode from all rmap data. */ 695 STATIC int 696 xrep_dinode_count_rmaps( 697 struct xrep_inode *ri) 698 { 699 struct xfs_perag *pag; 700 xfs_agnumber_t agno; 701 int error; 702 703 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 704 return -EOPNOTSUPP; 705 706 for_each_perag(ri->sc->mp, agno, pag) { 707 error = xrep_dinode_count_ag_rmaps(ri, pag); 708 if (error) { 709 xfs_perag_rele(pag); 710 return error; 711 } 712 } 713 714 /* Can't have extents on both the rt and the data device. */ 715 if (ri->data_extents && ri->rt_extents) 716 return -EFSCORRUPTED; 717 718 trace_xrep_dinode_count_rmaps(ri->sc, 719 ri->data_blocks, ri->rt_blocks, ri->attr_blocks, 720 ri->data_extents, ri->rt_extents, ri->attr_extents); 721 return 0; 722 } 723 724 /* Return true if this extents-format ifork looks like garbage. */ 725 STATIC bool 726 xrep_dinode_bad_extents_fork( 727 struct xfs_scrub *sc, 728 struct xfs_dinode *dip, 729 unsigned int dfork_size, 730 int whichfork) 731 { 732 struct xfs_bmbt_irec new; 733 struct xfs_bmbt_rec *dp; 734 xfs_extnum_t nex; 735 bool isrt; 736 unsigned int i; 737 738 nex = xfs_dfork_nextents(dip, whichfork); 739 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) 740 return true; 741 742 dp = XFS_DFORK_PTR(dip, whichfork); 743 744 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); 745 for (i = 0; i < nex; i++, dp++) { 746 xfs_failaddr_t fa; 747 748 xfs_bmbt_disk_get_all(dp, &new); 749 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, 750 &new); 751 if (fa) 752 return true; 753 } 754 755 return false; 756 } 757 758 /* Return true if this btree-format ifork looks like garbage. */ 759 STATIC bool 760 xrep_dinode_bad_bmbt_fork( 761 struct xfs_scrub *sc, 762 struct xfs_dinode *dip, 763 unsigned int dfork_size, 764 int whichfork) 765 { 766 struct xfs_bmdr_block *dfp; 767 xfs_extnum_t nex; 768 unsigned int i; 769 unsigned int dmxr; 770 unsigned int nrecs; 771 unsigned int level; 772 773 nex = xfs_dfork_nextents(dip, whichfork); 774 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) 775 return true; 776 777 if (dfork_size < sizeof(struct xfs_bmdr_block)) 778 return true; 779 780 dfp = XFS_DFORK_PTR(dip, whichfork); 781 nrecs = be16_to_cpu(dfp->bb_numrecs); 782 level = be16_to_cpu(dfp->bb_level); 783 784 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) 785 return true; 786 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) 787 return true; 788 789 dmxr = xfs_bmdr_maxrecs(dfork_size, 0); 790 for (i = 1; i <= nrecs; i++) { 791 struct xfs_bmbt_key *fkp; 792 xfs_bmbt_ptr_t *fpp; 793 xfs_fileoff_t fileoff; 794 xfs_fsblock_t fsbno; 795 796 fkp = XFS_BMDR_KEY_ADDR(dfp, i); 797 fileoff = be64_to_cpu(fkp->br_startoff); 798 if (!xfs_verify_fileoff(sc->mp, fileoff)) 799 return true; 800 801 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); 802 fsbno = be64_to_cpu(*fpp); 803 if (!xfs_verify_fsbno(sc->mp, fsbno)) 804 return true; 805 } 806 807 return false; 808 } 809 810 /* 811 * Check the data fork for things that will fail the ifork verifiers or the 812 * ifork formatters. 813 */ 814 STATIC bool 815 xrep_dinode_check_dfork( 816 struct xfs_scrub *sc, 817 struct xfs_dinode *dip, 818 uint16_t mode) 819 { 820 void *dfork_ptr; 821 int64_t data_size; 822 unsigned int fmt; 823 unsigned int dfork_size; 824 825 /* 826 * Verifier functions take signed int64_t, so check for bogus negative 827 * values first. 828 */ 829 data_size = be64_to_cpu(dip->di_size); 830 if (data_size < 0) 831 return true; 832 833 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); 834 switch (mode & S_IFMT) { 835 case S_IFIFO: 836 case S_IFCHR: 837 case S_IFBLK: 838 case S_IFSOCK: 839 if (fmt != XFS_DINODE_FMT_DEV) 840 return true; 841 break; 842 case S_IFREG: 843 if (fmt == XFS_DINODE_FMT_LOCAL) 844 return true; 845 fallthrough; 846 case S_IFLNK: 847 case S_IFDIR: 848 switch (fmt) { 849 case XFS_DINODE_FMT_LOCAL: 850 case XFS_DINODE_FMT_EXTENTS: 851 case XFS_DINODE_FMT_BTREE: 852 break; 853 default: 854 return true; 855 } 856 break; 857 default: 858 return true; 859 } 860 861 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); 862 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 863 864 switch (fmt) { 865 case XFS_DINODE_FMT_DEV: 866 break; 867 case XFS_DINODE_FMT_LOCAL: 868 /* dir/symlink structure cannot be larger than the fork */ 869 if (data_size > dfork_size) 870 return true; 871 /* directory structure must pass verification. */ 872 if (S_ISDIR(mode) && 873 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) 874 return true; 875 /* symlink structure must pass verification. */ 876 if (S_ISLNK(mode) && 877 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) 878 return true; 879 break; 880 case XFS_DINODE_FMT_EXTENTS: 881 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, 882 XFS_DATA_FORK)) 883 return true; 884 break; 885 case XFS_DINODE_FMT_BTREE: 886 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 887 XFS_DATA_FORK)) 888 return true; 889 break; 890 default: 891 return true; 892 } 893 894 return false; 895 } 896 897 static void 898 xrep_dinode_set_data_nextents( 899 struct xfs_dinode *dip, 900 xfs_extnum_t nextents) 901 { 902 if (xfs_dinode_has_large_extent_counts(dip)) 903 dip->di_big_nextents = cpu_to_be64(nextents); 904 else 905 dip->di_nextents = cpu_to_be32(nextents); 906 } 907 908 static void 909 xrep_dinode_set_attr_nextents( 910 struct xfs_dinode *dip, 911 xfs_extnum_t nextents) 912 { 913 if (xfs_dinode_has_large_extent_counts(dip)) 914 dip->di_big_anextents = cpu_to_be32(nextents); 915 else 916 dip->di_anextents = cpu_to_be16(nextents); 917 } 918 919 /* Reset the data fork to something sane. */ 920 STATIC void 921 xrep_dinode_zap_dfork( 922 struct xrep_inode *ri, 923 struct xfs_dinode *dip, 924 uint16_t mode) 925 { 926 struct xfs_scrub *sc = ri->sc; 927 928 trace_xrep_dinode_zap_dfork(sc, dip); 929 930 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; 931 932 xrep_dinode_set_data_nextents(dip, 0); 933 ri->data_blocks = 0; 934 ri->rt_blocks = 0; 935 936 /* Special files always get reset to DEV */ 937 switch (mode & S_IFMT) { 938 case S_IFIFO: 939 case S_IFCHR: 940 case S_IFBLK: 941 case S_IFSOCK: 942 dip->di_format = XFS_DINODE_FMT_DEV; 943 dip->di_size = 0; 944 return; 945 } 946 947 /* 948 * If we have data extents, reset to an empty map and hope the user 949 * will run the bmapbtd checker next. 950 */ 951 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { 952 dip->di_format = XFS_DINODE_FMT_EXTENTS; 953 return; 954 } 955 956 /* Otherwise, reset the local format to the minimum. */ 957 switch (mode & S_IFMT) { 958 case S_IFLNK: 959 xrep_dinode_zap_symlink(ri, dip); 960 break; 961 case S_IFDIR: 962 xrep_dinode_zap_dir(ri, dip); 963 break; 964 } 965 } 966 967 /* 968 * Check the attr fork for things that will fail the ifork verifiers or the 969 * ifork formatters. 970 */ 971 STATIC bool 972 xrep_dinode_check_afork( 973 struct xfs_scrub *sc, 974 struct xfs_dinode *dip) 975 { 976 struct xfs_attr_sf_hdr *afork_ptr; 977 size_t attr_size; 978 unsigned int afork_size; 979 980 if (XFS_DFORK_BOFF(dip) == 0) 981 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || 982 xfs_dfork_attr_extents(dip) != 0; 983 984 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 985 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 986 987 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { 988 case XFS_DINODE_FMT_LOCAL: 989 /* Fork has to be large enough to extract the xattr size. */ 990 if (afork_size < sizeof(struct xfs_attr_sf_hdr)) 991 return true; 992 993 /* xattr structure cannot be larger than the fork */ 994 attr_size = be16_to_cpu(afork_ptr->totsize); 995 if (attr_size > afork_size) 996 return true; 997 998 /* xattr structure must pass verification. */ 999 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; 1000 case XFS_DINODE_FMT_EXTENTS: 1001 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, 1002 XFS_ATTR_FORK)) 1003 return true; 1004 break; 1005 case XFS_DINODE_FMT_BTREE: 1006 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, 1007 XFS_ATTR_FORK)) 1008 return true; 1009 break; 1010 default: 1011 return true; 1012 } 1013 1014 return false; 1015 } 1016 1017 /* 1018 * Reset the attr fork to empty. Since the attr fork could have contained 1019 * ACLs, make the file readable only by root. 1020 */ 1021 STATIC void 1022 xrep_dinode_zap_afork( 1023 struct xrep_inode *ri, 1024 struct xfs_dinode *dip, 1025 uint16_t mode) 1026 { 1027 struct xfs_scrub *sc = ri->sc; 1028 1029 trace_xrep_dinode_zap_afork(sc, dip); 1030 1031 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; 1032 1033 dip->di_aformat = XFS_DINODE_FMT_EXTENTS; 1034 xrep_dinode_set_attr_nextents(dip, 0); 1035 ri->attr_blocks = 0; 1036 1037 /* 1038 * If the data fork is in btree format, removing the attr fork entirely 1039 * might cause verifier failures if the next level down in the bmbt 1040 * could now fit in the data fork area. 1041 */ 1042 if (dip->di_format != XFS_DINODE_FMT_BTREE) 1043 dip->di_forkoff = 0; 1044 dip->di_mode = cpu_to_be16(mode & ~0777); 1045 dip->di_uid = 0; 1046 dip->di_gid = 0; 1047 } 1048 1049 /* Make sure the fork offset is a sensible value. */ 1050 STATIC void 1051 xrep_dinode_ensure_forkoff( 1052 struct xrep_inode *ri, 1053 struct xfs_dinode *dip, 1054 uint16_t mode) 1055 { 1056 struct xfs_bmdr_block *bmdr; 1057 struct xfs_scrub *sc = ri->sc; 1058 xfs_extnum_t attr_extents, data_extents; 1059 size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); 1060 unsigned int lit_sz = XFS_LITINO(sc->mp); 1061 unsigned int afork_min, dfork_min; 1062 1063 trace_xrep_dinode_ensure_forkoff(sc, dip); 1064 1065 /* 1066 * Before calling this function, xrep_dinode_core ensured that both 1067 * forks actually fit inside their respective literal areas. If this 1068 * was not the case, the fork was reset to FMT_EXTENTS with zero 1069 * records. If the rmapbt scan found attr or data fork blocks, this 1070 * will be noted in the dinode_stats, and we must leave enough room 1071 * for the bmap repair code to reconstruct the mapping structure. 1072 * 1073 * First, compute the minimum space required for the attr fork. 1074 */ 1075 switch (dip->di_aformat) { 1076 case XFS_DINODE_FMT_LOCAL: 1077 /* 1078 * If we still have a shortform xattr structure at all, that 1079 * means the attr fork area was exactly large enough to fit 1080 * the sf structure. 1081 */ 1082 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1083 break; 1084 case XFS_DINODE_FMT_EXTENTS: 1085 attr_extents = xfs_dfork_attr_extents(dip); 1086 if (attr_extents) { 1087 /* 1088 * We must maintain sufficient space to hold the entire 1089 * extent map array in the data fork. Note that we 1090 * previously zapped the fork if it had no chance of 1091 * fitting in the inode. 1092 */ 1093 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; 1094 } else if (ri->attr_extents > 0) { 1095 /* 1096 * The attr fork thinks it has zero extents, but we 1097 * found some xattr extents. We need to leave enough 1098 * empty space here so that the incore attr fork will 1099 * get created (and hence trigger the attr fork bmap 1100 * repairer). 1101 */ 1102 afork_min = bmdr_minsz; 1103 } else { 1104 /* No extents on disk or found in rmapbt. */ 1105 afork_min = 0; 1106 } 1107 break; 1108 case XFS_DINODE_FMT_BTREE: 1109 /* Must have space for btree header and key/pointers. */ 1110 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1111 afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1112 break; 1113 default: 1114 /* We should never see any other formats. */ 1115 afork_min = 0; 1116 break; 1117 } 1118 1119 /* Compute the minimum space required for the data fork. */ 1120 switch (dip->di_format) { 1121 case XFS_DINODE_FMT_DEV: 1122 dfork_min = sizeof(__be32); 1123 break; 1124 case XFS_DINODE_FMT_UUID: 1125 dfork_min = sizeof(uuid_t); 1126 break; 1127 case XFS_DINODE_FMT_LOCAL: 1128 /* 1129 * If we still have a shortform data fork at all, that means 1130 * the data fork area was large enough to fit whatever was in 1131 * there. 1132 */ 1133 dfork_min = be64_to_cpu(dip->di_size); 1134 break; 1135 case XFS_DINODE_FMT_EXTENTS: 1136 data_extents = xfs_dfork_data_extents(dip); 1137 if (data_extents) { 1138 /* 1139 * We must maintain sufficient space to hold the entire 1140 * extent map array in the data fork. Note that we 1141 * previously zapped the fork if it had no chance of 1142 * fitting in the inode. 1143 */ 1144 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; 1145 } else if (ri->data_extents > 0 || ri->rt_extents > 0) { 1146 /* 1147 * The data fork thinks it has zero extents, but we 1148 * found some data extents. We need to leave enough 1149 * empty space here so that the data fork bmap repair 1150 * will recover the mappings. 1151 */ 1152 dfork_min = bmdr_minsz; 1153 } else { 1154 /* No extents on disk or found in rmapbt. */ 1155 dfork_min = 0; 1156 } 1157 break; 1158 case XFS_DINODE_FMT_BTREE: 1159 /* Must have space for btree header and key/pointers. */ 1160 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1161 dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1162 break; 1163 default: 1164 dfork_min = 0; 1165 break; 1166 } 1167 1168 /* 1169 * Round all values up to the nearest 8 bytes, because that is the 1170 * precision of di_forkoff. 1171 */ 1172 afork_min = roundup(afork_min, 8); 1173 dfork_min = roundup(dfork_min, 8); 1174 bmdr_minsz = roundup(bmdr_minsz, 8); 1175 1176 ASSERT(dfork_min <= lit_sz); 1177 ASSERT(afork_min <= lit_sz); 1178 1179 /* 1180 * If the data fork was zapped and we don't have enough space for the 1181 * recovery fork, move the attr fork up. 1182 */ 1183 if (dip->di_format == XFS_DINODE_FMT_EXTENTS && 1184 xfs_dfork_data_extents(dip) == 0 && 1185 (ri->data_extents > 0 || ri->rt_extents > 0) && 1186 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { 1187 if (bmdr_minsz + afork_min > lit_sz) { 1188 /* 1189 * The attr for and the stub fork we need to recover 1190 * the data fork won't both fit. Zap the attr fork. 1191 */ 1192 xrep_dinode_zap_afork(ri, dip, mode); 1193 afork_min = bmdr_minsz; 1194 } else { 1195 void *before, *after; 1196 1197 /* Otherwise, just slide the attr fork up. */ 1198 before = XFS_DFORK_APTR(dip); 1199 dip->di_forkoff = bmdr_minsz >> 3; 1200 after = XFS_DFORK_APTR(dip); 1201 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); 1202 } 1203 } 1204 1205 /* 1206 * If the attr fork was zapped and we don't have enough space for the 1207 * recovery fork, move the attr fork down. 1208 */ 1209 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && 1210 xfs_dfork_attr_extents(dip) == 0 && 1211 ri->attr_extents > 0 && 1212 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { 1213 if (dip->di_format == XFS_DINODE_FMT_BTREE) { 1214 /* 1215 * If the data fork is in btree format then we can't 1216 * adjust forkoff because that runs the risk of 1217 * violating the extents/btree format transition rules. 1218 */ 1219 } else if (bmdr_minsz + dfork_min > lit_sz) { 1220 /* 1221 * If we can't move the attr fork, too bad, we lose the 1222 * attr fork and leak its blocks. 1223 */ 1224 xrep_dinode_zap_afork(ri, dip, mode); 1225 } else { 1226 /* 1227 * Otherwise, just slide the attr fork down. The attr 1228 * fork is empty, so we don't have any old contents to 1229 * move here. 1230 */ 1231 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; 1232 } 1233 } 1234 } 1235 1236 /* 1237 * Zap the data/attr forks if we spot anything that isn't going to pass the 1238 * ifork verifiers or the ifork formatters, because we need to get the inode 1239 * into good enough shape that the higher level repair functions can run. 1240 */ 1241 STATIC void 1242 xrep_dinode_zap_forks( 1243 struct xrep_inode *ri, 1244 struct xfs_dinode *dip) 1245 { 1246 struct xfs_scrub *sc = ri->sc; 1247 xfs_extnum_t data_extents; 1248 xfs_extnum_t attr_extents; 1249 xfs_filblks_t nblocks; 1250 uint16_t mode; 1251 bool zap_datafork = false; 1252 bool zap_attrfork = ri->zap_acls; 1253 1254 trace_xrep_dinode_zap_forks(sc, dip); 1255 1256 mode = be16_to_cpu(dip->di_mode); 1257 1258 data_extents = xfs_dfork_data_extents(dip); 1259 attr_extents = xfs_dfork_attr_extents(dip); 1260 nblocks = be64_to_cpu(dip->di_nblocks); 1261 1262 /* Inode counters don't make sense? */ 1263 if (data_extents > nblocks) 1264 zap_datafork = true; 1265 if (attr_extents > nblocks) 1266 zap_attrfork = true; 1267 if (data_extents + attr_extents > nblocks) 1268 zap_datafork = zap_attrfork = true; 1269 1270 if (!zap_datafork) 1271 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); 1272 if (!zap_attrfork) 1273 zap_attrfork = xrep_dinode_check_afork(sc, dip); 1274 1275 /* Zap whatever's bad. */ 1276 if (zap_attrfork) 1277 xrep_dinode_zap_afork(ri, dip, mode); 1278 if (zap_datafork) 1279 xrep_dinode_zap_dfork(ri, dip, mode); 1280 xrep_dinode_ensure_forkoff(ri, dip, mode); 1281 1282 /* 1283 * Zero di_nblocks if we don't have any extents at all to satisfy the 1284 * buffer verifier. 1285 */ 1286 data_extents = xfs_dfork_data_extents(dip); 1287 attr_extents = xfs_dfork_attr_extents(dip); 1288 if (data_extents + attr_extents == 0) 1289 dip->di_nblocks = 0; 1290 } 1291 1292 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ 1293 STATIC int 1294 xrep_dinode_core( 1295 struct xrep_inode *ri) 1296 { 1297 struct xfs_scrub *sc = ri->sc; 1298 struct xfs_buf *bp; 1299 struct xfs_dinode *dip; 1300 xfs_ino_t ino = sc->sm->sm_ino; 1301 int error; 1302 int iget_error; 1303 1304 /* Figure out what this inode had mapped in both forks. */ 1305 error = xrep_dinode_count_rmaps(ri); 1306 if (error) 1307 return error; 1308 1309 /* Read the inode cluster buffer. */ 1310 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1311 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1312 NULL); 1313 if (error) 1314 return error; 1315 1316 /* Make sure we can pass the inode buffer verifier. */ 1317 xrep_dinode_buf(sc, bp); 1318 bp->b_ops = &xfs_inode_buf_ops; 1319 1320 /* Fix everything the verifier will complain about. */ 1321 dip = xfs_buf_offset(bp, ri->imap.im_boffset); 1322 xrep_dinode_header(sc, dip); 1323 iget_error = xrep_dinode_mode(ri, dip); 1324 if (iget_error) 1325 goto write; 1326 xrep_dinode_flags(sc, dip, ri->rt_extents > 0); 1327 xrep_dinode_size(ri, dip); 1328 xrep_dinode_extsize_hints(sc, dip); 1329 xrep_dinode_zap_forks(ri, dip); 1330 1331 write: 1332 /* Write out the inode. */ 1333 trace_xrep_dinode_fixed(sc, dip); 1334 xfs_dinode_calc_crc(sc->mp, dip); 1335 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); 1336 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, 1337 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); 1338 1339 /* 1340 * In theory, we've fixed the ondisk inode record enough that we should 1341 * be able to load the inode into the cache. Try to iget that inode 1342 * now while we hold the AGI and the inode cluster buffer and take the 1343 * IOLOCK so that we can continue with repairs without anyone else 1344 * accessing the inode. If iget fails, we still need to commit the 1345 * changes. 1346 */ 1347 if (!iget_error) 1348 iget_error = xchk_iget(sc, ino, &sc->ip); 1349 if (!iget_error) 1350 xchk_ilock(sc, XFS_IOLOCK_EXCL); 1351 1352 /* 1353 * Commit the inode cluster buffer updates and drop the AGI buffer that 1354 * we've been holding since scrub setup. From here on out, repairs 1355 * deal only with the cached inode. 1356 */ 1357 error = xrep_trans_commit(sc); 1358 if (error) 1359 return error; 1360 1361 if (iget_error) 1362 return iget_error; 1363 1364 error = xchk_trans_alloc(sc, 0); 1365 if (error) 1366 return error; 1367 1368 error = xrep_ino_dqattach(sc); 1369 if (error) 1370 return error; 1371 1372 xchk_ilock(sc, XFS_ILOCK_EXCL); 1373 if (ri->ino_sick_mask) 1374 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); 1375 return 0; 1376 } 1377 1378 /* Fix everything xfs_dinode_verify cares about. */ 1379 STATIC int 1380 xrep_dinode_problems( 1381 struct xrep_inode *ri) 1382 { 1383 struct xfs_scrub *sc = ri->sc; 1384 int error; 1385 1386 error = xrep_dinode_core(ri); 1387 if (error) 1388 return error; 1389 1390 /* We had to fix a totally busted inode, schedule quotacheck. */ 1391 if (XFS_IS_UQUOTA_ON(sc->mp)) 1392 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1393 if (XFS_IS_GQUOTA_ON(sc->mp)) 1394 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1395 if (XFS_IS_PQUOTA_ON(sc->mp)) 1396 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1397 1398 return 0; 1399 } 1400 1401 /* 1402 * Fix problems that the verifiers don't care about. In general these are 1403 * errors that don't cause problems elsewhere in the kernel that we can easily 1404 * detect, so we don't check them all that rigorously. 1405 */ 1406 1407 /* Make sure block and extent counts are ok. */ 1408 STATIC int 1409 xrep_inode_blockcounts( 1410 struct xfs_scrub *sc) 1411 { 1412 struct xfs_ifork *ifp; 1413 xfs_filblks_t count; 1414 xfs_filblks_t acount; 1415 xfs_extnum_t nextents; 1416 int error; 1417 1418 trace_xrep_inode_blockcounts(sc); 1419 1420 /* Set data fork counters from the data fork mappings. */ 1421 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1422 &nextents, &count); 1423 if (error) 1424 return error; 1425 if (xfs_is_reflink_inode(sc->ip)) { 1426 /* 1427 * data fork blockcount can exceed physical storage if a user 1428 * reflinks the same block over and over again. 1429 */ 1430 ; 1431 } else if (XFS_IS_REALTIME_INODE(sc->ip)) { 1432 if (count >= sc->mp->m_sb.sb_rblocks) 1433 return -EFSCORRUPTED; 1434 } else { 1435 if (count >= sc->mp->m_sb.sb_dblocks) 1436 return -EFSCORRUPTED; 1437 } 1438 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); 1439 if (error) 1440 return error; 1441 sc->ip->i_df.if_nextents = nextents; 1442 1443 /* Set attr fork counters from the attr fork mappings. */ 1444 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1445 if (ifp) { 1446 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1447 &nextents, &acount); 1448 if (error) 1449 return error; 1450 if (count >= sc->mp->m_sb.sb_dblocks) 1451 return -EFSCORRUPTED; 1452 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, 1453 nextents); 1454 if (error) 1455 return error; 1456 ifp->if_nextents = nextents; 1457 } else { 1458 acount = 0; 1459 } 1460 1461 sc->ip->i_nblocks = count + acount; 1462 return 0; 1463 } 1464 1465 /* Check for invalid uid/gid/prid. */ 1466 STATIC void 1467 xrep_inode_ids( 1468 struct xfs_scrub *sc) 1469 { 1470 bool dirty = false; 1471 1472 trace_xrep_inode_ids(sc); 1473 1474 if (!uid_valid(VFS_I(sc->ip)->i_uid)) { 1475 i_uid_write(VFS_I(sc->ip), 0); 1476 dirty = true; 1477 if (XFS_IS_UQUOTA_ON(sc->mp)) 1478 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1479 } 1480 1481 if (!gid_valid(VFS_I(sc->ip)->i_gid)) { 1482 i_gid_write(VFS_I(sc->ip), 0); 1483 dirty = true; 1484 if (XFS_IS_GQUOTA_ON(sc->mp)) 1485 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1486 } 1487 1488 if (sc->ip->i_projid == -1U) { 1489 sc->ip->i_projid = 0; 1490 dirty = true; 1491 if (XFS_IS_PQUOTA_ON(sc->mp)) 1492 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1493 } 1494 1495 /* strip setuid/setgid if we touched any of the ids */ 1496 if (dirty) 1497 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); 1498 } 1499 1500 static inline void 1501 xrep_clamp_timestamp( 1502 struct xfs_inode *ip, 1503 struct timespec64 *ts) 1504 { 1505 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); 1506 *ts = timestamp_truncate(*ts, VFS_I(ip)); 1507 } 1508 1509 /* Nanosecond counters can't have more than 1 billion. */ 1510 STATIC void 1511 xrep_inode_timestamps( 1512 struct xfs_inode *ip) 1513 { 1514 struct timespec64 tstamp; 1515 struct inode *inode = VFS_I(ip); 1516 1517 tstamp = inode_get_atime(inode); 1518 xrep_clamp_timestamp(ip, &tstamp); 1519 inode_set_atime_to_ts(inode, tstamp); 1520 1521 tstamp = inode_get_mtime(inode); 1522 xrep_clamp_timestamp(ip, &tstamp); 1523 inode_set_mtime_to_ts(inode, tstamp); 1524 1525 tstamp = inode_get_ctime(inode); 1526 xrep_clamp_timestamp(ip, &tstamp); 1527 inode_set_ctime_to_ts(inode, tstamp); 1528 1529 xrep_clamp_timestamp(ip, &ip->i_crtime); 1530 } 1531 1532 /* Fix inode flags that don't make sense together. */ 1533 STATIC void 1534 xrep_inode_flags( 1535 struct xfs_scrub *sc) 1536 { 1537 uint16_t mode; 1538 1539 trace_xrep_inode_flags(sc); 1540 1541 mode = VFS_I(sc->ip)->i_mode; 1542 1543 /* Clear junk flags */ 1544 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) 1545 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; 1546 1547 /* NEWRTBM only applies to realtime bitmaps */ 1548 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) 1549 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; 1550 else 1551 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; 1552 1553 /* These only make sense for directories. */ 1554 if (!S_ISDIR(mode)) 1555 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | 1556 XFS_DIFLAG_EXTSZINHERIT | 1557 XFS_DIFLAG_PROJINHERIT | 1558 XFS_DIFLAG_NOSYMLINKS); 1559 1560 /* These only make sense for files. */ 1561 if (!S_ISREG(mode)) 1562 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | 1563 XFS_DIFLAG_EXTSIZE); 1564 1565 /* These only make sense for non-rt files. */ 1566 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1567 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; 1568 1569 /* Immutable and append only? Drop the append. */ 1570 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && 1571 (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) 1572 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; 1573 1574 /* Clear junk flags. */ 1575 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) 1576 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; 1577 1578 /* No reflink flag unless we support it and it's a file. */ 1579 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) 1580 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1581 1582 /* DAX only applies to files and dirs. */ 1583 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1584 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1585 1586 /* No reflink files on the realtime device. */ 1587 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1588 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1589 } 1590 1591 /* 1592 * Fix size problems with block/node format directories. If we fail to find 1593 * the extent list, just bail out and let the bmapbtd repair functions clean 1594 * up that mess. 1595 */ 1596 STATIC void 1597 xrep_inode_blockdir_size( 1598 struct xfs_scrub *sc) 1599 { 1600 struct xfs_iext_cursor icur; 1601 struct xfs_bmbt_irec got; 1602 struct xfs_ifork *ifp; 1603 xfs_fileoff_t off; 1604 int error; 1605 1606 trace_xrep_inode_blockdir_size(sc); 1607 1608 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); 1609 if (error) 1610 return; 1611 1612 /* Find the last block before 32G; this is the dir size. */ 1613 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1614 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); 1615 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { 1616 /* zero-extents directory? */ 1617 return; 1618 } 1619 1620 off = got.br_startoff + got.br_blockcount; 1621 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, 1622 XFS_FSB_TO_B(sc->mp, off)); 1623 } 1624 1625 /* Fix size problems with short format directories. */ 1626 STATIC void 1627 xrep_inode_sfdir_size( 1628 struct xfs_scrub *sc) 1629 { 1630 struct xfs_ifork *ifp; 1631 1632 trace_xrep_inode_sfdir_size(sc); 1633 1634 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1635 sc->ip->i_disk_size = ifp->if_bytes; 1636 } 1637 1638 /* 1639 * Fix any irregularities in a directory inode's size now that we can iterate 1640 * extent maps and access other regular inode data. 1641 */ 1642 STATIC void 1643 xrep_inode_dir_size( 1644 struct xfs_scrub *sc) 1645 { 1646 trace_xrep_inode_dir_size(sc); 1647 1648 switch (sc->ip->i_df.if_format) { 1649 case XFS_DINODE_FMT_EXTENTS: 1650 case XFS_DINODE_FMT_BTREE: 1651 xrep_inode_blockdir_size(sc); 1652 break; 1653 case XFS_DINODE_FMT_LOCAL: 1654 xrep_inode_sfdir_size(sc); 1655 break; 1656 } 1657 } 1658 1659 /* Fix extent size hint problems. */ 1660 STATIC void 1661 xrep_inode_extsize( 1662 struct xfs_scrub *sc) 1663 { 1664 /* Fix misaligned extent size hints on a directory. */ 1665 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1666 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && 1667 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { 1668 sc->ip->i_extsize = 0; 1669 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; 1670 } 1671 } 1672 1673 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1674 STATIC int 1675 xrep_inode_problems( 1676 struct xfs_scrub *sc) 1677 { 1678 int error; 1679 1680 error = xrep_inode_blockcounts(sc); 1681 if (error) 1682 return error; 1683 xrep_inode_timestamps(sc->ip); 1684 xrep_inode_flags(sc); 1685 xrep_inode_ids(sc); 1686 /* 1687 * We can now do a better job fixing the size of a directory now that 1688 * we can scan the data fork extents than we could in xrep_dinode_size. 1689 */ 1690 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1691 xrep_inode_dir_size(sc); 1692 xrep_inode_extsize(sc); 1693 1694 trace_xrep_inode_fixed(sc); 1695 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1696 return xrep_roll_trans(sc); 1697 } 1698 1699 /* Repair an inode's fields. */ 1700 int 1701 xrep_inode( 1702 struct xfs_scrub *sc) 1703 { 1704 int error = 0; 1705 1706 /* 1707 * No inode? That means we failed the _iget verifiers. Repair all 1708 * the things that the inode verifiers care about, then retry _iget. 1709 */ 1710 if (!sc->ip) { 1711 struct xrep_inode *ri = sc->buf; 1712 1713 ASSERT(ri != NULL); 1714 1715 error = xrep_dinode_problems(ri); 1716 if (error == -EBUSY) { 1717 /* 1718 * Directory scan to recover inode mode encountered a 1719 * busy inode, so we did not continue repairing things. 1720 */ 1721 return 0; 1722 } 1723 if (error) 1724 return error; 1725 1726 /* By this point we had better have a working incore inode. */ 1727 if (!sc->ip) 1728 return -EFSCORRUPTED; 1729 } 1730 1731 xfs_trans_ijoin(sc->tp, sc->ip, 0); 1732 1733 /* If we found corruption of any kind, try to fix it. */ 1734 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || 1735 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { 1736 error = xrep_inode_problems(sc); 1737 if (error) 1738 return error; 1739 } 1740 1741 /* See if we can clear the reflink flag. */ 1742 if (xfs_is_reflink_inode(sc->ip)) { 1743 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); 1744 if (error) 1745 return error; 1746 } 1747 1748 return xrep_defer_finish(sc); 1749 } 1750