1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_icache.h" 20 #include "xfs_inode_buf.h" 21 #include "xfs_inode_fork.h" 22 #include "xfs_ialloc.h" 23 #include "xfs_da_format.h" 24 #include "xfs_reflink.h" 25 #include "xfs_alloc.h" 26 #include "xfs_rmap.h" 27 #include "xfs_rmap_btree.h" 28 #include "xfs_bmap.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_util.h" 31 #include "xfs_dir2.h" 32 #include "xfs_dir2_priv.h" 33 #include "xfs_quota_defs.h" 34 #include "xfs_quota.h" 35 #include "xfs_ag.h" 36 #include "xfs_rtbitmap.h" 37 #include "xfs_attr_leaf.h" 38 #include "xfs_log_priv.h" 39 #include "xfs_health.h" 40 #include "xfs_symlink_remote.h" 41 #include "scrub/xfs_scrub.h" 42 #include "scrub/scrub.h" 43 #include "scrub/common.h" 44 #include "scrub/btree.h" 45 #include "scrub/trace.h" 46 #include "scrub/repair.h" 47 #include "scrub/iscan.h" 48 #include "scrub/readdir.h" 49 50 /* 51 * Inode Record Repair 52 * =================== 53 * 54 * Roughly speaking, inode problems can be classified based on whether or not 55 * they trip the dinode verifiers. If those trip, then we won't be able to 56 * xfs_iget ourselves the inode. 57 * 58 * Therefore, the xrep_dinode_* functions fix anything that will cause the 59 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions 60 * fix things on live incore inodes. The inode repair functions make decisions 61 * with security and usability implications when reviving a file: 62 * 63 * - Files with zero di_mode or a garbage di_mode are converted to regular file 64 * that only root can read. This file may not actually contain user data, 65 * if the file was not previously a regular file. Setuid and setgid bits 66 * are cleared. 67 * 68 * - Zero-size directories can be truncated to look empty. It is necessary to 69 * run the bmapbtd and directory repair functions to fully rebuild the 70 * directory. 71 * 72 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary 73 * to run the bmapbtd and symlink repair functions to salvage the symlink. 74 * 75 * - Invalid extent size hints will be removed. 76 * 77 * - Quotacheck will be scheduled if we repaired an inode that was so badly 78 * damaged that the ondisk inode had to be rebuilt. 79 * 80 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. 81 * Setuid and setgid bits are cleared. 82 * 83 * - Data and attr forks are reset to extents format with zero extents if the 84 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta 85 * repair functions to recover the space mapping. 86 * 87 * - ACLs will not be recovered if the attr fork is zapped or the extended 88 * attribute structure itself requires salvaging. 89 * 90 * - If the attr fork is zapped, the user and group ids are reset to root and 91 * the setuid and setgid bits are removed. 92 */ 93 94 /* 95 * All the information we need to repair the ondisk inode if we can't iget the 96 * incore inode. We don't allocate this buffer unless we're going to perform 97 * a repair to the ondisk inode cluster buffer. 98 */ 99 struct xrep_inode { 100 /* Inode mapping that we saved from the initial lookup attempt. */ 101 struct xfs_imap imap; 102 103 struct xfs_scrub *sc; 104 105 /* Blocks in use on the data device by data extents or bmbt blocks. */ 106 xfs_rfsblock_t data_blocks; 107 108 /* Blocks in use on the rt device. */ 109 xfs_rfsblock_t rt_blocks; 110 111 /* Blocks in use by the attr fork. */ 112 xfs_rfsblock_t attr_blocks; 113 114 /* Number of data device extents for the data fork. */ 115 xfs_extnum_t data_extents; 116 117 /* 118 * Number of realtime device extents for the data fork. If 119 * data_extents and rt_extents indicate that the data fork has extents 120 * on both devices, we'll just back away slowly. 121 */ 122 xfs_extnum_t rt_extents; 123 124 /* Number of (data device) extents for the attr fork. */ 125 xfs_aextnum_t attr_extents; 126 127 /* Sick state to set after zapping parts of the inode. */ 128 unsigned int ino_sick_mask; 129 130 /* Must we remove all access from this file? */ 131 bool zap_acls; 132 133 /* Inode scanner to see if we can find the ftype from dirents */ 134 struct xchk_iscan ftype_iscan; 135 uint8_t alleged_ftype; 136 }; 137 138 /* 139 * Setup function for inode repair. @imap contains the ondisk inode mapping 140 * information so that we can correct the ondisk inode cluster buffer if 141 * necessary to make iget work. 142 */ 143 int 144 xrep_setup_inode( 145 struct xfs_scrub *sc, 146 const struct xfs_imap *imap) 147 { 148 struct xrep_inode *ri; 149 150 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); 151 if (!sc->buf) 152 return -ENOMEM; 153 154 ri = sc->buf; 155 memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); 156 ri->sc = sc; 157 return 0; 158 } 159 160 /* 161 * Make sure this ondisk inode can pass the inode buffer verifier. This is 162 * not the same as the dinode verifier. 163 */ 164 STATIC void 165 xrep_dinode_buf_core( 166 struct xfs_scrub *sc, 167 struct xfs_buf *bp, 168 unsigned int ioffset) 169 { 170 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); 171 struct xfs_trans *tp = sc->tp; 172 struct xfs_mount *mp = sc->mp; 173 xfs_agino_t agino; 174 bool crc_ok = false; 175 bool magic_ok = false; 176 bool unlinked_ok = false; 177 178 agino = be32_to_cpu(dip->di_next_unlinked); 179 180 if (xfs_verify_agino_or_null(bp->b_pag, agino)) 181 unlinked_ok = true; 182 183 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 184 xfs_dinode_good_version(mp, dip->di_version)) 185 magic_ok = true; 186 187 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 188 XFS_DINODE_CRC_OFF)) 189 crc_ok = true; 190 191 if (magic_ok && unlinked_ok && crc_ok) 192 return; 193 194 if (!magic_ok) { 195 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 196 dip->di_version = 3; 197 } 198 if (!unlinked_ok) 199 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 200 xfs_dinode_calc_crc(mp, dip); 201 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 202 xfs_trans_log_buf(tp, bp, ioffset, 203 ioffset + sizeof(struct xfs_dinode) - 1); 204 } 205 206 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */ 207 STATIC void 208 xrep_dinode_buf( 209 struct xfs_scrub *sc, 210 struct xfs_buf *bp) 211 { 212 struct xfs_mount *mp = sc->mp; 213 int i; 214 int ni; 215 216 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 217 for (i = 0; i < ni; i++) 218 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); 219 } 220 221 /* Reinitialize things that never change in an inode. */ 222 STATIC void 223 xrep_dinode_header( 224 struct xfs_scrub *sc, 225 struct xfs_dinode *dip) 226 { 227 trace_xrep_dinode_header(sc, dip); 228 229 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 230 if (!xfs_dinode_good_version(sc->mp, dip->di_version)) 231 dip->di_version = 3; 232 dip->di_ino = cpu_to_be64(sc->sm->sm_ino); 233 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); 234 dip->di_gen = cpu_to_be32(sc->sm->sm_gen); 235 } 236 237 /* 238 * If this directory entry points to the scrub target inode, then the directory 239 * we're scanning is the parent of the scrub target inode. 240 */ 241 STATIC int 242 xrep_dinode_findmode_dirent( 243 struct xfs_scrub *sc, 244 struct xfs_inode *dp, 245 xfs_dir2_dataptr_t dapos, 246 const struct xfs_name *name, 247 xfs_ino_t ino, 248 void *priv) 249 { 250 struct xrep_inode *ri = priv; 251 int error = 0; 252 253 if (xchk_should_terminate(ri->sc, &error)) 254 return error; 255 256 if (ino != sc->sm->sm_ino) 257 return 0; 258 259 /* Ignore garbage directory entry names. */ 260 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) 261 return -EFSCORRUPTED; 262 263 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 264 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 265 xfs_dir2_samename(name, &xfs_name_dot)) 266 return 0; 267 268 /* 269 * Uhoh, more than one parent for this inode and they don't agree on 270 * the file type? 271 */ 272 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && 273 ri->alleged_ftype != name->type) { 274 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, 275 ri->alleged_ftype); 276 return -EFSCORRUPTED; 277 } 278 279 /* We found a potential parent; remember the ftype. */ 280 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); 281 ri->alleged_ftype = name->type; 282 return 0; 283 } 284 285 /* 286 * If this is a directory, walk the dirents looking for any that point to the 287 * scrub target inode. 288 */ 289 STATIC int 290 xrep_dinode_findmode_walk_directory( 291 struct xrep_inode *ri, 292 struct xfs_inode *dp) 293 { 294 struct xfs_scrub *sc = ri->sc; 295 unsigned int lock_mode; 296 int error = 0; 297 298 /* 299 * Scan the directory to see if there it contains an entry pointing to 300 * the directory that we are repairing. 301 */ 302 lock_mode = xfs_ilock_data_map_shared(dp); 303 304 /* 305 * If this directory is known to be sick, we cannot scan it reliably 306 * and must abort. 307 */ 308 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | 309 XFS_SICK_INO_BMBTD | 310 XFS_SICK_INO_DIR)) { 311 error = -EFSCORRUPTED; 312 goto out_unlock; 313 } 314 315 /* 316 * We cannot complete our parent pointer scan if a directory looks as 317 * though it has been zapped by the inode record repair code. 318 */ 319 if (xchk_dir_looks_zapped(dp)) { 320 error = -EBUSY; 321 goto out_unlock; 322 } 323 324 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); 325 if (error) 326 goto out_unlock; 327 328 out_unlock: 329 xfs_iunlock(dp, lock_mode); 330 return error; 331 } 332 333 /* 334 * Try to find the mode of the inode being repaired by looking for directories 335 * that point down to this file. 336 */ 337 STATIC int 338 xrep_dinode_find_mode( 339 struct xrep_inode *ri, 340 uint16_t *mode) 341 { 342 struct xfs_scrub *sc = ri->sc; 343 struct xfs_inode *dp; 344 int error; 345 346 /* No ftype means we have no other metadata to consult. */ 347 if (!xfs_has_ftype(sc->mp)) { 348 *mode = S_IFREG; 349 return 0; 350 } 351 352 /* 353 * Scan all directories for parents that might point down to this 354 * inode. Skip the inode being repaired during the scan since it 355 * cannot be its own parent. Note that we still hold the AGI locked 356 * so there's a real possibility that _iscan_iter can return EBUSY. 357 */ 358 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); 359 ri->ftype_iscan.skip_ino = sc->sm->sm_ino; 360 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; 361 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { 362 if (S_ISDIR(VFS_I(dp)->i_mode)) 363 error = xrep_dinode_findmode_walk_directory(ri, dp); 364 xchk_iscan_mark_visited(&ri->ftype_iscan, dp); 365 xchk_irele(sc, dp); 366 if (error < 0) 367 break; 368 if (xchk_should_terminate(sc, &error)) 369 break; 370 } 371 xchk_iscan_iter_finish(&ri->ftype_iscan); 372 xchk_iscan_teardown(&ri->ftype_iscan); 373 374 if (error == -EBUSY) { 375 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { 376 /* 377 * If we got an EBUSY after finding at least one 378 * dirent, that means the scan found an inode on the 379 * inactivation list and could not open it. Accept the 380 * alleged ftype and install a new mode below. 381 */ 382 error = 0; 383 } else if (!(sc->flags & XCHK_TRY_HARDER)) { 384 /* 385 * Otherwise, retry the operation one time to see if 386 * the reason for the delay is an inode from the same 387 * cluster buffer waiting on the inactivation list. 388 */ 389 error = -EDEADLOCK; 390 } 391 } 392 if (error) 393 return error; 394 395 /* 396 * Convert the discovered ftype into the file mode. If all else fails, 397 * return S_IFREG. 398 */ 399 switch (ri->alleged_ftype) { 400 case XFS_DIR3_FT_DIR: 401 *mode = S_IFDIR; 402 break; 403 case XFS_DIR3_FT_WHT: 404 case XFS_DIR3_FT_CHRDEV: 405 *mode = S_IFCHR; 406 break; 407 case XFS_DIR3_FT_BLKDEV: 408 *mode = S_IFBLK; 409 break; 410 case XFS_DIR3_FT_FIFO: 411 *mode = S_IFIFO; 412 break; 413 case XFS_DIR3_FT_SOCK: 414 *mode = S_IFSOCK; 415 break; 416 case XFS_DIR3_FT_SYMLINK: 417 *mode = S_IFLNK; 418 break; 419 default: 420 *mode = S_IFREG; 421 break; 422 } 423 return 0; 424 } 425 426 /* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ 427 STATIC int 428 xrep_dinode_mode( 429 struct xrep_inode *ri, 430 struct xfs_dinode *dip) 431 { 432 struct xfs_scrub *sc = ri->sc; 433 uint16_t mode = be16_to_cpu(dip->di_mode); 434 int error; 435 436 trace_xrep_dinode_mode(sc, dip); 437 438 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) 439 return 0; 440 441 /* Try to fix the mode. If we cannot, then leave everything alone. */ 442 error = xrep_dinode_find_mode(ri, &mode); 443 switch (error) { 444 case -EINTR: 445 case -EBUSY: 446 case -EDEADLOCK: 447 /* temporary failure or fatal signal */ 448 return error; 449 case 0: 450 /* found mode */ 451 break; 452 default: 453 /* some other error, assume S_IFREG */ 454 mode = S_IFREG; 455 break; 456 } 457 458 /* bad mode, so we set it to a file that only root can read */ 459 dip->di_mode = cpu_to_be16(mode); 460 dip->di_uid = 0; 461 dip->di_gid = 0; 462 ri->zap_acls = true; 463 return 0; 464 } 465 466 /* Fix any conflicting flags that the verifiers complain about. */ 467 STATIC void 468 xrep_dinode_flags( 469 struct xfs_scrub *sc, 470 struct xfs_dinode *dip, 471 bool isrt) 472 { 473 struct xfs_mount *mp = sc->mp; 474 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 475 uint16_t flags = be16_to_cpu(dip->di_flags); 476 uint16_t mode = be16_to_cpu(dip->di_mode); 477 478 trace_xrep_dinode_flags(sc, dip); 479 480 if (isrt) 481 flags |= XFS_DIFLAG_REALTIME; 482 else 483 flags &= ~XFS_DIFLAG_REALTIME; 484 485 /* 486 * For regular files on a reflink filesystem, set the REFLINK flag to 487 * protect shared extents. A later stage will actually check those 488 * extents and clear the flag if possible. 489 */ 490 if (xfs_has_reflink(mp) && S_ISREG(mode)) 491 flags2 |= XFS_DIFLAG2_REFLINK; 492 else 493 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 494 if (flags & XFS_DIFLAG_REALTIME) 495 flags2 &= ~XFS_DIFLAG2_REFLINK; 496 if (!xfs_has_bigtime(mp)) 497 flags2 &= ~XFS_DIFLAG2_BIGTIME; 498 if (!xfs_has_large_extent_counts(mp)) 499 flags2 &= ~XFS_DIFLAG2_NREXT64; 500 if (flags2 & XFS_DIFLAG2_NREXT64) 501 dip->di_nrext64_pad = 0; 502 else if (dip->di_version >= 3) 503 dip->di_v3_pad = 0; 504 dip->di_flags = cpu_to_be16(flags); 505 dip->di_flags2 = cpu_to_be64(flags2); 506 } 507 508 /* 509 * Blow out symlink; now it points nowhere. We don't have to worry about 510 * incore state because this inode is failing the verifiers. 511 */ 512 STATIC void 513 xrep_dinode_zap_symlink( 514 struct xrep_inode *ri, 515 struct xfs_dinode *dip) 516 { 517 struct xfs_scrub *sc = ri->sc; 518 char *p; 519 520 trace_xrep_dinode_zap_symlink(sc, dip); 521 522 dip->di_format = XFS_DINODE_FMT_LOCAL; 523 dip->di_size = cpu_to_be64(1); 524 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 525 *p = '?'; 526 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; 527 } 528 529 /* 530 * Blow out dir, make the parent point to the root. In the future repair will 531 * reconstruct this directory for us. Note that there's no in-core directory 532 * inode because the sf verifier tripped, so we don't have to worry about the 533 * dentry cache. 534 */ 535 STATIC void 536 xrep_dinode_zap_dir( 537 struct xrep_inode *ri, 538 struct xfs_dinode *dip) 539 { 540 struct xfs_scrub *sc = ri->sc; 541 struct xfs_mount *mp = sc->mp; 542 struct xfs_dir2_sf_hdr *sfp; 543 int i8count; 544 545 trace_xrep_dinode_zap_dir(sc, dip); 546 547 dip->di_format = XFS_DINODE_FMT_LOCAL; 548 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; 549 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 550 sfp->count = 0; 551 sfp->i8count = i8count; 552 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); 553 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); 554 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; 555 } 556 557 /* Make sure we don't have a garbage file size. */ 558 STATIC void 559 xrep_dinode_size( 560 struct xrep_inode *ri, 561 struct xfs_dinode *dip) 562 { 563 struct xfs_scrub *sc = ri->sc; 564 uint64_t size = be64_to_cpu(dip->di_size); 565 uint16_t mode = be16_to_cpu(dip->di_mode); 566 567 trace_xrep_dinode_size(sc, dip); 568 569 switch (mode & S_IFMT) { 570 case S_IFIFO: 571 case S_IFCHR: 572 case S_IFBLK: 573 case S_IFSOCK: 574 /* di_size can't be nonzero for special files */ 575 dip->di_size = 0; 576 break; 577 case S_IFREG: 578 /* Regular files can't be larger than 2^63-1 bytes. */ 579 dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); 580 break; 581 case S_IFLNK: 582 /* 583 * Truncate ridiculously oversized symlinks. If the size is 584 * zero, reset it to point to the current directory. Both of 585 * these conditions trigger dinode verifier errors, so there 586 * is no in-core state to reset. 587 */ 588 if (size > XFS_SYMLINK_MAXLEN) 589 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); 590 else if (size == 0) 591 xrep_dinode_zap_symlink(ri, dip); 592 break; 593 case S_IFDIR: 594 /* 595 * Directories can't have a size larger than 32G. If the size 596 * is zero, reset it to an empty directory. Both of these 597 * conditions trigger dinode verifier errors, so there is no 598 * in-core state to reset. 599 */ 600 if (size > XFS_DIR2_SPACE_SIZE) 601 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); 602 else if (size == 0) 603 xrep_dinode_zap_dir(ri, dip); 604 break; 605 } 606 } 607 608 /* Fix extent size hints. */ 609 STATIC void 610 xrep_dinode_extsize_hints( 611 struct xfs_scrub *sc, 612 struct xfs_dinode *dip) 613 { 614 struct xfs_mount *mp = sc->mp; 615 uint64_t flags2 = be64_to_cpu(dip->di_flags2); 616 uint16_t flags = be16_to_cpu(dip->di_flags); 617 uint16_t mode = be16_to_cpu(dip->di_mode); 618 619 xfs_failaddr_t fa; 620 621 trace_xrep_dinode_extsize_hints(sc, dip); 622 623 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 624 mode, flags); 625 if (fa) { 626 dip->di_extsize = 0; 627 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | 628 XFS_DIFLAG_EXTSZINHERIT); 629 } 630 631 if (dip->di_version < 3) 632 return; 633 634 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 635 mode, flags, flags2); 636 if (fa) { 637 dip->di_cowextsize = 0; 638 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); 639 } 640 } 641 642 /* Count extents and blocks for an inode given an rmap. */ 643 STATIC int 644 xrep_dinode_walk_rmap( 645 struct xfs_btree_cur *cur, 646 const struct xfs_rmap_irec *rec, 647 void *priv) 648 { 649 struct xrep_inode *ri = priv; 650 int error = 0; 651 652 if (xchk_should_terminate(ri->sc, &error)) 653 return error; 654 655 /* We only care about this inode. */ 656 if (rec->rm_owner != ri->sc->sm->sm_ino) 657 return 0; 658 659 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { 660 ri->attr_blocks += rec->rm_blockcount; 661 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 662 ri->attr_extents++; 663 664 return 0; 665 } 666 667 ri->data_blocks += rec->rm_blockcount; 668 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 669 ri->data_extents++; 670 671 return 0; 672 } 673 674 /* Count extents and blocks for an inode from all AG rmap data. */ 675 STATIC int 676 xrep_dinode_count_ag_rmaps( 677 struct xrep_inode *ri, 678 struct xfs_perag *pag) 679 { 680 struct xfs_btree_cur *cur; 681 struct xfs_buf *agf; 682 int error; 683 684 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); 685 if (error) 686 return error; 687 688 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); 689 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); 690 xfs_btree_del_cursor(cur, error); 691 xfs_trans_brelse(ri->sc->tp, agf); 692 return error; 693 } 694 695 /* Count extents and blocks for a given inode from all rmap data. */ 696 STATIC int 697 xrep_dinode_count_rmaps( 698 struct xrep_inode *ri) 699 { 700 struct xfs_perag *pag; 701 xfs_agnumber_t agno; 702 int error; 703 704 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 705 return -EOPNOTSUPP; 706 707 for_each_perag(ri->sc->mp, agno, pag) { 708 error = xrep_dinode_count_ag_rmaps(ri, pag); 709 if (error) { 710 xfs_perag_rele(pag); 711 return error; 712 } 713 } 714 715 /* Can't have extents on both the rt and the data device. */ 716 if (ri->data_extents && ri->rt_extents) 717 return -EFSCORRUPTED; 718 719 trace_xrep_dinode_count_rmaps(ri->sc, 720 ri->data_blocks, ri->rt_blocks, ri->attr_blocks, 721 ri->data_extents, ri->rt_extents, ri->attr_extents); 722 return 0; 723 } 724 725 /* Return true if this extents-format ifork looks like garbage. */ 726 STATIC bool 727 xrep_dinode_bad_extents_fork( 728 struct xfs_scrub *sc, 729 struct xfs_dinode *dip, 730 unsigned int dfork_size, 731 int whichfork) 732 { 733 struct xfs_bmbt_irec new; 734 struct xfs_bmbt_rec *dp; 735 xfs_extnum_t nex; 736 bool isrt; 737 unsigned int i; 738 739 nex = xfs_dfork_nextents(dip, whichfork); 740 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) 741 return true; 742 743 dp = XFS_DFORK_PTR(dip, whichfork); 744 745 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); 746 for (i = 0; i < nex; i++, dp++) { 747 xfs_failaddr_t fa; 748 749 xfs_bmbt_disk_get_all(dp, &new); 750 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, 751 &new); 752 if (fa) 753 return true; 754 } 755 756 return false; 757 } 758 759 /* Return true if this btree-format ifork looks like garbage. */ 760 STATIC bool 761 xrep_dinode_bad_bmbt_fork( 762 struct xfs_scrub *sc, 763 struct xfs_dinode *dip, 764 unsigned int dfork_size, 765 int whichfork) 766 { 767 struct xfs_bmdr_block *dfp; 768 xfs_extnum_t nex; 769 unsigned int i; 770 unsigned int dmxr; 771 unsigned int nrecs; 772 unsigned int level; 773 774 nex = xfs_dfork_nextents(dip, whichfork); 775 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) 776 return true; 777 778 if (dfork_size < sizeof(struct xfs_bmdr_block)) 779 return true; 780 781 dfp = XFS_DFORK_PTR(dip, whichfork); 782 nrecs = be16_to_cpu(dfp->bb_numrecs); 783 level = be16_to_cpu(dfp->bb_level); 784 785 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) 786 return true; 787 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) 788 return true; 789 790 dmxr = xfs_bmdr_maxrecs(dfork_size, 0); 791 for (i = 1; i <= nrecs; i++) { 792 struct xfs_bmbt_key *fkp; 793 xfs_bmbt_ptr_t *fpp; 794 xfs_fileoff_t fileoff; 795 xfs_fsblock_t fsbno; 796 797 fkp = XFS_BMDR_KEY_ADDR(dfp, i); 798 fileoff = be64_to_cpu(fkp->br_startoff); 799 if (!xfs_verify_fileoff(sc->mp, fileoff)) 800 return true; 801 802 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); 803 fsbno = be64_to_cpu(*fpp); 804 if (!xfs_verify_fsbno(sc->mp, fsbno)) 805 return true; 806 } 807 808 return false; 809 } 810 811 /* 812 * Check the data fork for things that will fail the ifork verifiers or the 813 * ifork formatters. 814 */ 815 STATIC bool 816 xrep_dinode_check_dfork( 817 struct xfs_scrub *sc, 818 struct xfs_dinode *dip, 819 uint16_t mode) 820 { 821 void *dfork_ptr; 822 int64_t data_size; 823 unsigned int fmt; 824 unsigned int dfork_size; 825 826 /* 827 * Verifier functions take signed int64_t, so check for bogus negative 828 * values first. 829 */ 830 data_size = be64_to_cpu(dip->di_size); 831 if (data_size < 0) 832 return true; 833 834 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); 835 switch (mode & S_IFMT) { 836 case S_IFIFO: 837 case S_IFCHR: 838 case S_IFBLK: 839 case S_IFSOCK: 840 if (fmt != XFS_DINODE_FMT_DEV) 841 return true; 842 break; 843 case S_IFREG: 844 if (fmt == XFS_DINODE_FMT_LOCAL) 845 return true; 846 fallthrough; 847 case S_IFLNK: 848 case S_IFDIR: 849 switch (fmt) { 850 case XFS_DINODE_FMT_LOCAL: 851 case XFS_DINODE_FMT_EXTENTS: 852 case XFS_DINODE_FMT_BTREE: 853 break; 854 default: 855 return true; 856 } 857 break; 858 default: 859 return true; 860 } 861 862 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); 863 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 864 865 switch (fmt) { 866 case XFS_DINODE_FMT_DEV: 867 break; 868 case XFS_DINODE_FMT_LOCAL: 869 /* dir/symlink structure cannot be larger than the fork */ 870 if (data_size > dfork_size) 871 return true; 872 /* directory structure must pass verification. */ 873 if (S_ISDIR(mode) && 874 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) 875 return true; 876 /* symlink structure must pass verification. */ 877 if (S_ISLNK(mode) && 878 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) 879 return true; 880 break; 881 case XFS_DINODE_FMT_EXTENTS: 882 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, 883 XFS_DATA_FORK)) 884 return true; 885 break; 886 case XFS_DINODE_FMT_BTREE: 887 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 888 XFS_DATA_FORK)) 889 return true; 890 break; 891 default: 892 return true; 893 } 894 895 return false; 896 } 897 898 static void 899 xrep_dinode_set_data_nextents( 900 struct xfs_dinode *dip, 901 xfs_extnum_t nextents) 902 { 903 if (xfs_dinode_has_large_extent_counts(dip)) 904 dip->di_big_nextents = cpu_to_be64(nextents); 905 else 906 dip->di_nextents = cpu_to_be32(nextents); 907 } 908 909 static void 910 xrep_dinode_set_attr_nextents( 911 struct xfs_dinode *dip, 912 xfs_extnum_t nextents) 913 { 914 if (xfs_dinode_has_large_extent_counts(dip)) 915 dip->di_big_anextents = cpu_to_be32(nextents); 916 else 917 dip->di_anextents = cpu_to_be16(nextents); 918 } 919 920 /* Reset the data fork to something sane. */ 921 STATIC void 922 xrep_dinode_zap_dfork( 923 struct xrep_inode *ri, 924 struct xfs_dinode *dip, 925 uint16_t mode) 926 { 927 struct xfs_scrub *sc = ri->sc; 928 929 trace_xrep_dinode_zap_dfork(sc, dip); 930 931 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; 932 933 xrep_dinode_set_data_nextents(dip, 0); 934 ri->data_blocks = 0; 935 ri->rt_blocks = 0; 936 937 /* Special files always get reset to DEV */ 938 switch (mode & S_IFMT) { 939 case S_IFIFO: 940 case S_IFCHR: 941 case S_IFBLK: 942 case S_IFSOCK: 943 dip->di_format = XFS_DINODE_FMT_DEV; 944 dip->di_size = 0; 945 return; 946 } 947 948 /* 949 * If we have data extents, reset to an empty map and hope the user 950 * will run the bmapbtd checker next. 951 */ 952 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { 953 dip->di_format = XFS_DINODE_FMT_EXTENTS; 954 return; 955 } 956 957 /* Otherwise, reset the local format to the minimum. */ 958 switch (mode & S_IFMT) { 959 case S_IFLNK: 960 xrep_dinode_zap_symlink(ri, dip); 961 break; 962 case S_IFDIR: 963 xrep_dinode_zap_dir(ri, dip); 964 break; 965 } 966 } 967 968 /* 969 * Check the attr fork for things that will fail the ifork verifiers or the 970 * ifork formatters. 971 */ 972 STATIC bool 973 xrep_dinode_check_afork( 974 struct xfs_scrub *sc, 975 struct xfs_dinode *dip) 976 { 977 struct xfs_attr_sf_hdr *afork_ptr; 978 size_t attr_size; 979 unsigned int afork_size; 980 981 if (XFS_DFORK_BOFF(dip) == 0) 982 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || 983 xfs_dfork_attr_extents(dip) != 0; 984 985 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 986 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 987 988 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { 989 case XFS_DINODE_FMT_LOCAL: 990 /* Fork has to be large enough to extract the xattr size. */ 991 if (afork_size < sizeof(struct xfs_attr_sf_hdr)) 992 return true; 993 994 /* xattr structure cannot be larger than the fork */ 995 attr_size = be16_to_cpu(afork_ptr->totsize); 996 if (attr_size > afork_size) 997 return true; 998 999 /* xattr structure must pass verification. */ 1000 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; 1001 case XFS_DINODE_FMT_EXTENTS: 1002 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, 1003 XFS_ATTR_FORK)) 1004 return true; 1005 break; 1006 case XFS_DINODE_FMT_BTREE: 1007 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, 1008 XFS_ATTR_FORK)) 1009 return true; 1010 break; 1011 default: 1012 return true; 1013 } 1014 1015 return false; 1016 } 1017 1018 /* 1019 * Reset the attr fork to empty. Since the attr fork could have contained 1020 * ACLs, make the file readable only by root. 1021 */ 1022 STATIC void 1023 xrep_dinode_zap_afork( 1024 struct xrep_inode *ri, 1025 struct xfs_dinode *dip, 1026 uint16_t mode) 1027 { 1028 struct xfs_scrub *sc = ri->sc; 1029 1030 trace_xrep_dinode_zap_afork(sc, dip); 1031 1032 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; 1033 1034 dip->di_aformat = XFS_DINODE_FMT_EXTENTS; 1035 xrep_dinode_set_attr_nextents(dip, 0); 1036 ri->attr_blocks = 0; 1037 1038 /* 1039 * If the data fork is in btree format, removing the attr fork entirely 1040 * might cause verifier failures if the next level down in the bmbt 1041 * could now fit in the data fork area. 1042 */ 1043 if (dip->di_format != XFS_DINODE_FMT_BTREE) 1044 dip->di_forkoff = 0; 1045 dip->di_mode = cpu_to_be16(mode & ~0777); 1046 dip->di_uid = 0; 1047 dip->di_gid = 0; 1048 } 1049 1050 /* Make sure the fork offset is a sensible value. */ 1051 STATIC void 1052 xrep_dinode_ensure_forkoff( 1053 struct xrep_inode *ri, 1054 struct xfs_dinode *dip, 1055 uint16_t mode) 1056 { 1057 struct xfs_bmdr_block *bmdr; 1058 struct xfs_scrub *sc = ri->sc; 1059 xfs_extnum_t attr_extents, data_extents; 1060 size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); 1061 unsigned int lit_sz = XFS_LITINO(sc->mp); 1062 unsigned int afork_min, dfork_min; 1063 1064 trace_xrep_dinode_ensure_forkoff(sc, dip); 1065 1066 /* 1067 * Before calling this function, xrep_dinode_core ensured that both 1068 * forks actually fit inside their respective literal areas. If this 1069 * was not the case, the fork was reset to FMT_EXTENTS with zero 1070 * records. If the rmapbt scan found attr or data fork blocks, this 1071 * will be noted in the dinode_stats, and we must leave enough room 1072 * for the bmap repair code to reconstruct the mapping structure. 1073 * 1074 * First, compute the minimum space required for the attr fork. 1075 */ 1076 switch (dip->di_aformat) { 1077 case XFS_DINODE_FMT_LOCAL: 1078 /* 1079 * If we still have a shortform xattr structure at all, that 1080 * means the attr fork area was exactly large enough to fit 1081 * the sf structure. 1082 */ 1083 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); 1084 break; 1085 case XFS_DINODE_FMT_EXTENTS: 1086 attr_extents = xfs_dfork_attr_extents(dip); 1087 if (attr_extents) { 1088 /* 1089 * We must maintain sufficient space to hold the entire 1090 * extent map array in the data fork. Note that we 1091 * previously zapped the fork if it had no chance of 1092 * fitting in the inode. 1093 */ 1094 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; 1095 } else if (ri->attr_extents > 0) { 1096 /* 1097 * The attr fork thinks it has zero extents, but we 1098 * found some xattr extents. We need to leave enough 1099 * empty space here so that the incore attr fork will 1100 * get created (and hence trigger the attr fork bmap 1101 * repairer). 1102 */ 1103 afork_min = bmdr_minsz; 1104 } else { 1105 /* No extents on disk or found in rmapbt. */ 1106 afork_min = 0; 1107 } 1108 break; 1109 case XFS_DINODE_FMT_BTREE: 1110 /* Must have space for btree header and key/pointers. */ 1111 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); 1112 afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1113 break; 1114 default: 1115 /* We should never see any other formats. */ 1116 afork_min = 0; 1117 break; 1118 } 1119 1120 /* Compute the minimum space required for the data fork. */ 1121 switch (dip->di_format) { 1122 case XFS_DINODE_FMT_DEV: 1123 dfork_min = sizeof(__be32); 1124 break; 1125 case XFS_DINODE_FMT_UUID: 1126 dfork_min = sizeof(uuid_t); 1127 break; 1128 case XFS_DINODE_FMT_LOCAL: 1129 /* 1130 * If we still have a shortform data fork at all, that means 1131 * the data fork area was large enough to fit whatever was in 1132 * there. 1133 */ 1134 dfork_min = be64_to_cpu(dip->di_size); 1135 break; 1136 case XFS_DINODE_FMT_EXTENTS: 1137 data_extents = xfs_dfork_data_extents(dip); 1138 if (data_extents) { 1139 /* 1140 * We must maintain sufficient space to hold the entire 1141 * extent map array in the data fork. Note that we 1142 * previously zapped the fork if it had no chance of 1143 * fitting in the inode. 1144 */ 1145 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; 1146 } else if (ri->data_extents > 0 || ri->rt_extents > 0) { 1147 /* 1148 * The data fork thinks it has zero extents, but we 1149 * found some data extents. We need to leave enough 1150 * empty space here so that the data fork bmap repair 1151 * will recover the mappings. 1152 */ 1153 dfork_min = bmdr_minsz; 1154 } else { 1155 /* No extents on disk or found in rmapbt. */ 1156 dfork_min = 0; 1157 } 1158 break; 1159 case XFS_DINODE_FMT_BTREE: 1160 /* Must have space for btree header and key/pointers. */ 1161 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1162 dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); 1163 break; 1164 default: 1165 dfork_min = 0; 1166 break; 1167 } 1168 1169 /* 1170 * Round all values up to the nearest 8 bytes, because that is the 1171 * precision of di_forkoff. 1172 */ 1173 afork_min = roundup(afork_min, 8); 1174 dfork_min = roundup(dfork_min, 8); 1175 bmdr_minsz = roundup(bmdr_minsz, 8); 1176 1177 ASSERT(dfork_min <= lit_sz); 1178 ASSERT(afork_min <= lit_sz); 1179 1180 /* 1181 * If the data fork was zapped and we don't have enough space for the 1182 * recovery fork, move the attr fork up. 1183 */ 1184 if (dip->di_format == XFS_DINODE_FMT_EXTENTS && 1185 xfs_dfork_data_extents(dip) == 0 && 1186 (ri->data_extents > 0 || ri->rt_extents > 0) && 1187 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { 1188 if (bmdr_minsz + afork_min > lit_sz) { 1189 /* 1190 * The attr for and the stub fork we need to recover 1191 * the data fork won't both fit. Zap the attr fork. 1192 */ 1193 xrep_dinode_zap_afork(ri, dip, mode); 1194 afork_min = bmdr_minsz; 1195 } else { 1196 void *before, *after; 1197 1198 /* Otherwise, just slide the attr fork up. */ 1199 before = XFS_DFORK_APTR(dip); 1200 dip->di_forkoff = bmdr_minsz >> 3; 1201 after = XFS_DFORK_APTR(dip); 1202 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); 1203 } 1204 } 1205 1206 /* 1207 * If the attr fork was zapped and we don't have enough space for the 1208 * recovery fork, move the attr fork down. 1209 */ 1210 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && 1211 xfs_dfork_attr_extents(dip) == 0 && 1212 ri->attr_extents > 0 && 1213 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { 1214 if (dip->di_format == XFS_DINODE_FMT_BTREE) { 1215 /* 1216 * If the data fork is in btree format then we can't 1217 * adjust forkoff because that runs the risk of 1218 * violating the extents/btree format transition rules. 1219 */ 1220 } else if (bmdr_minsz + dfork_min > lit_sz) { 1221 /* 1222 * If we can't move the attr fork, too bad, we lose the 1223 * attr fork and leak its blocks. 1224 */ 1225 xrep_dinode_zap_afork(ri, dip, mode); 1226 } else { 1227 /* 1228 * Otherwise, just slide the attr fork down. The attr 1229 * fork is empty, so we don't have any old contents to 1230 * move here. 1231 */ 1232 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; 1233 } 1234 } 1235 } 1236 1237 /* 1238 * Zap the data/attr forks if we spot anything that isn't going to pass the 1239 * ifork verifiers or the ifork formatters, because we need to get the inode 1240 * into good enough shape that the higher level repair functions can run. 1241 */ 1242 STATIC void 1243 xrep_dinode_zap_forks( 1244 struct xrep_inode *ri, 1245 struct xfs_dinode *dip) 1246 { 1247 struct xfs_scrub *sc = ri->sc; 1248 xfs_extnum_t data_extents; 1249 xfs_extnum_t attr_extents; 1250 xfs_filblks_t nblocks; 1251 uint16_t mode; 1252 bool zap_datafork = false; 1253 bool zap_attrfork = ri->zap_acls; 1254 1255 trace_xrep_dinode_zap_forks(sc, dip); 1256 1257 mode = be16_to_cpu(dip->di_mode); 1258 1259 data_extents = xfs_dfork_data_extents(dip); 1260 attr_extents = xfs_dfork_attr_extents(dip); 1261 nblocks = be64_to_cpu(dip->di_nblocks); 1262 1263 /* Inode counters don't make sense? */ 1264 if (data_extents > nblocks) 1265 zap_datafork = true; 1266 if (attr_extents > nblocks) 1267 zap_attrfork = true; 1268 if (data_extents + attr_extents > nblocks) 1269 zap_datafork = zap_attrfork = true; 1270 1271 if (!zap_datafork) 1272 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); 1273 if (!zap_attrfork) 1274 zap_attrfork = xrep_dinode_check_afork(sc, dip); 1275 1276 /* Zap whatever's bad. */ 1277 if (zap_attrfork) 1278 xrep_dinode_zap_afork(ri, dip, mode); 1279 if (zap_datafork) 1280 xrep_dinode_zap_dfork(ri, dip, mode); 1281 xrep_dinode_ensure_forkoff(ri, dip, mode); 1282 1283 /* 1284 * Zero di_nblocks if we don't have any extents at all to satisfy the 1285 * buffer verifier. 1286 */ 1287 data_extents = xfs_dfork_data_extents(dip); 1288 attr_extents = xfs_dfork_attr_extents(dip); 1289 if (data_extents + attr_extents == 0) 1290 dip->di_nblocks = 0; 1291 } 1292 1293 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ 1294 STATIC int 1295 xrep_dinode_core( 1296 struct xrep_inode *ri) 1297 { 1298 struct xfs_scrub *sc = ri->sc; 1299 struct xfs_buf *bp; 1300 struct xfs_dinode *dip; 1301 xfs_ino_t ino = sc->sm->sm_ino; 1302 int error; 1303 int iget_error; 1304 1305 /* Figure out what this inode had mapped in both forks. */ 1306 error = xrep_dinode_count_rmaps(ri); 1307 if (error) 1308 return error; 1309 1310 /* Read the inode cluster buffer. */ 1311 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1312 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1313 NULL); 1314 if (error) 1315 return error; 1316 1317 /* Make sure we can pass the inode buffer verifier. */ 1318 xrep_dinode_buf(sc, bp); 1319 bp->b_ops = &xfs_inode_buf_ops; 1320 1321 /* Fix everything the verifier will complain about. */ 1322 dip = xfs_buf_offset(bp, ri->imap.im_boffset); 1323 xrep_dinode_header(sc, dip); 1324 iget_error = xrep_dinode_mode(ri, dip); 1325 if (iget_error) 1326 goto write; 1327 xrep_dinode_flags(sc, dip, ri->rt_extents > 0); 1328 xrep_dinode_size(ri, dip); 1329 xrep_dinode_extsize_hints(sc, dip); 1330 xrep_dinode_zap_forks(ri, dip); 1331 1332 write: 1333 /* Write out the inode. */ 1334 trace_xrep_dinode_fixed(sc, dip); 1335 xfs_dinode_calc_crc(sc->mp, dip); 1336 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); 1337 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, 1338 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); 1339 1340 /* 1341 * In theory, we've fixed the ondisk inode record enough that we should 1342 * be able to load the inode into the cache. Try to iget that inode 1343 * now while we hold the AGI and the inode cluster buffer and take the 1344 * IOLOCK so that we can continue with repairs without anyone else 1345 * accessing the inode. If iget fails, we still need to commit the 1346 * changes. 1347 */ 1348 if (!iget_error) 1349 iget_error = xchk_iget(sc, ino, &sc->ip); 1350 if (!iget_error) 1351 xchk_ilock(sc, XFS_IOLOCK_EXCL); 1352 1353 /* 1354 * Commit the inode cluster buffer updates and drop the AGI buffer that 1355 * we've been holding since scrub setup. From here on out, repairs 1356 * deal only with the cached inode. 1357 */ 1358 error = xrep_trans_commit(sc); 1359 if (error) 1360 return error; 1361 1362 if (iget_error) 1363 return iget_error; 1364 1365 error = xchk_trans_alloc(sc, 0); 1366 if (error) 1367 return error; 1368 1369 error = xrep_ino_dqattach(sc); 1370 if (error) 1371 return error; 1372 1373 xchk_ilock(sc, XFS_ILOCK_EXCL); 1374 if (ri->ino_sick_mask) 1375 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); 1376 return 0; 1377 } 1378 1379 /* Fix everything xfs_dinode_verify cares about. */ 1380 STATIC int 1381 xrep_dinode_problems( 1382 struct xrep_inode *ri) 1383 { 1384 struct xfs_scrub *sc = ri->sc; 1385 int error; 1386 1387 error = xrep_dinode_core(ri); 1388 if (error) 1389 return error; 1390 1391 /* We had to fix a totally busted inode, schedule quotacheck. */ 1392 if (XFS_IS_UQUOTA_ON(sc->mp)) 1393 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1394 if (XFS_IS_GQUOTA_ON(sc->mp)) 1395 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1396 if (XFS_IS_PQUOTA_ON(sc->mp)) 1397 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1398 1399 return 0; 1400 } 1401 1402 /* 1403 * Fix problems that the verifiers don't care about. In general these are 1404 * errors that don't cause problems elsewhere in the kernel that we can easily 1405 * detect, so we don't check them all that rigorously. 1406 */ 1407 1408 /* Make sure block and extent counts are ok. */ 1409 STATIC int 1410 xrep_inode_blockcounts( 1411 struct xfs_scrub *sc) 1412 { 1413 struct xfs_ifork *ifp; 1414 xfs_filblks_t count; 1415 xfs_filblks_t acount; 1416 xfs_extnum_t nextents; 1417 int error; 1418 1419 trace_xrep_inode_blockcounts(sc); 1420 1421 /* Set data fork counters from the data fork mappings. */ 1422 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1423 &nextents, &count); 1424 if (error) 1425 return error; 1426 if (xfs_is_reflink_inode(sc->ip)) { 1427 /* 1428 * data fork blockcount can exceed physical storage if a user 1429 * reflinks the same block over and over again. 1430 */ 1431 ; 1432 } else if (XFS_IS_REALTIME_INODE(sc->ip)) { 1433 if (count >= sc->mp->m_sb.sb_rblocks) 1434 return -EFSCORRUPTED; 1435 } else { 1436 if (count >= sc->mp->m_sb.sb_dblocks) 1437 return -EFSCORRUPTED; 1438 } 1439 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); 1440 if (error) 1441 return error; 1442 sc->ip->i_df.if_nextents = nextents; 1443 1444 /* Set attr fork counters from the attr fork mappings. */ 1445 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1446 if (ifp) { 1447 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1448 &nextents, &acount); 1449 if (error) 1450 return error; 1451 if (count >= sc->mp->m_sb.sb_dblocks) 1452 return -EFSCORRUPTED; 1453 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, 1454 nextents); 1455 if (error) 1456 return error; 1457 ifp->if_nextents = nextents; 1458 } else { 1459 acount = 0; 1460 } 1461 1462 sc->ip->i_nblocks = count + acount; 1463 return 0; 1464 } 1465 1466 /* Check for invalid uid/gid/prid. */ 1467 STATIC void 1468 xrep_inode_ids( 1469 struct xfs_scrub *sc) 1470 { 1471 bool dirty = false; 1472 1473 trace_xrep_inode_ids(sc); 1474 1475 if (!uid_valid(VFS_I(sc->ip)->i_uid)) { 1476 i_uid_write(VFS_I(sc->ip), 0); 1477 dirty = true; 1478 if (XFS_IS_UQUOTA_ON(sc->mp)) 1479 xrep_force_quotacheck(sc, XFS_DQTYPE_USER); 1480 } 1481 1482 if (!gid_valid(VFS_I(sc->ip)->i_gid)) { 1483 i_gid_write(VFS_I(sc->ip), 0); 1484 dirty = true; 1485 if (XFS_IS_GQUOTA_ON(sc->mp)) 1486 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); 1487 } 1488 1489 if (sc->ip->i_projid == -1U) { 1490 sc->ip->i_projid = 0; 1491 dirty = true; 1492 if (XFS_IS_PQUOTA_ON(sc->mp)) 1493 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); 1494 } 1495 1496 /* strip setuid/setgid if we touched any of the ids */ 1497 if (dirty) 1498 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); 1499 } 1500 1501 static inline void 1502 xrep_clamp_timestamp( 1503 struct xfs_inode *ip, 1504 struct timespec64 *ts) 1505 { 1506 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); 1507 *ts = timestamp_truncate(*ts, VFS_I(ip)); 1508 } 1509 1510 /* Nanosecond counters can't have more than 1 billion. */ 1511 STATIC void 1512 xrep_inode_timestamps( 1513 struct xfs_inode *ip) 1514 { 1515 struct timespec64 tstamp; 1516 struct inode *inode = VFS_I(ip); 1517 1518 tstamp = inode_get_atime(inode); 1519 xrep_clamp_timestamp(ip, &tstamp); 1520 inode_set_atime_to_ts(inode, tstamp); 1521 1522 tstamp = inode_get_mtime(inode); 1523 xrep_clamp_timestamp(ip, &tstamp); 1524 inode_set_mtime_to_ts(inode, tstamp); 1525 1526 tstamp = inode_get_ctime(inode); 1527 xrep_clamp_timestamp(ip, &tstamp); 1528 inode_set_ctime_to_ts(inode, tstamp); 1529 1530 xrep_clamp_timestamp(ip, &ip->i_crtime); 1531 } 1532 1533 /* Fix inode flags that don't make sense together. */ 1534 STATIC void 1535 xrep_inode_flags( 1536 struct xfs_scrub *sc) 1537 { 1538 uint16_t mode; 1539 1540 trace_xrep_inode_flags(sc); 1541 1542 mode = VFS_I(sc->ip)->i_mode; 1543 1544 /* Clear junk flags */ 1545 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) 1546 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; 1547 1548 /* NEWRTBM only applies to realtime bitmaps */ 1549 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) 1550 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; 1551 else 1552 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; 1553 1554 /* These only make sense for directories. */ 1555 if (!S_ISDIR(mode)) 1556 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | 1557 XFS_DIFLAG_EXTSZINHERIT | 1558 XFS_DIFLAG_PROJINHERIT | 1559 XFS_DIFLAG_NOSYMLINKS); 1560 1561 /* These only make sense for files. */ 1562 if (!S_ISREG(mode)) 1563 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | 1564 XFS_DIFLAG_EXTSIZE); 1565 1566 /* These only make sense for non-rt files. */ 1567 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1568 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; 1569 1570 /* Immutable and append only? Drop the append. */ 1571 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && 1572 (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) 1573 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; 1574 1575 /* Clear junk flags. */ 1576 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) 1577 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; 1578 1579 /* No reflink flag unless we support it and it's a file. */ 1580 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) 1581 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1582 1583 /* DAX only applies to files and dirs. */ 1584 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1585 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1586 1587 /* No reflink files on the realtime device. */ 1588 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1589 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1590 } 1591 1592 /* 1593 * Fix size problems with block/node format directories. If we fail to find 1594 * the extent list, just bail out and let the bmapbtd repair functions clean 1595 * up that mess. 1596 */ 1597 STATIC void 1598 xrep_inode_blockdir_size( 1599 struct xfs_scrub *sc) 1600 { 1601 struct xfs_iext_cursor icur; 1602 struct xfs_bmbt_irec got; 1603 struct xfs_ifork *ifp; 1604 xfs_fileoff_t off; 1605 int error; 1606 1607 trace_xrep_inode_blockdir_size(sc); 1608 1609 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); 1610 if (error) 1611 return; 1612 1613 /* Find the last block before 32G; this is the dir size. */ 1614 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1615 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); 1616 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { 1617 /* zero-extents directory? */ 1618 return; 1619 } 1620 1621 off = got.br_startoff + got.br_blockcount; 1622 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, 1623 XFS_FSB_TO_B(sc->mp, off)); 1624 } 1625 1626 /* Fix size problems with short format directories. */ 1627 STATIC void 1628 xrep_inode_sfdir_size( 1629 struct xfs_scrub *sc) 1630 { 1631 struct xfs_ifork *ifp; 1632 1633 trace_xrep_inode_sfdir_size(sc); 1634 1635 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1636 sc->ip->i_disk_size = ifp->if_bytes; 1637 } 1638 1639 /* 1640 * Fix any irregularities in a directory inode's size now that we can iterate 1641 * extent maps and access other regular inode data. 1642 */ 1643 STATIC void 1644 xrep_inode_dir_size( 1645 struct xfs_scrub *sc) 1646 { 1647 trace_xrep_inode_dir_size(sc); 1648 1649 switch (sc->ip->i_df.if_format) { 1650 case XFS_DINODE_FMT_EXTENTS: 1651 case XFS_DINODE_FMT_BTREE: 1652 xrep_inode_blockdir_size(sc); 1653 break; 1654 case XFS_DINODE_FMT_LOCAL: 1655 xrep_inode_sfdir_size(sc); 1656 break; 1657 } 1658 } 1659 1660 /* Fix extent size hint problems. */ 1661 STATIC void 1662 xrep_inode_extsize( 1663 struct xfs_scrub *sc) 1664 { 1665 /* Fix misaligned extent size hints on a directory. */ 1666 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1667 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && 1668 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { 1669 sc->ip->i_extsize = 0; 1670 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; 1671 } 1672 } 1673 1674 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1675 STATIC int 1676 xrep_inode_problems( 1677 struct xfs_scrub *sc) 1678 { 1679 int error; 1680 1681 error = xrep_inode_blockcounts(sc); 1682 if (error) 1683 return error; 1684 xrep_inode_timestamps(sc->ip); 1685 xrep_inode_flags(sc); 1686 xrep_inode_ids(sc); 1687 /* 1688 * We can now do a better job fixing the size of a directory now that 1689 * we can scan the data fork extents than we could in xrep_dinode_size. 1690 */ 1691 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1692 xrep_inode_dir_size(sc); 1693 xrep_inode_extsize(sc); 1694 1695 trace_xrep_inode_fixed(sc); 1696 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1697 return xrep_roll_trans(sc); 1698 } 1699 1700 /* Repair an inode's fields. */ 1701 int 1702 xrep_inode( 1703 struct xfs_scrub *sc) 1704 { 1705 int error = 0; 1706 1707 /* 1708 * No inode? That means we failed the _iget verifiers. Repair all 1709 * the things that the inode verifiers care about, then retry _iget. 1710 */ 1711 if (!sc->ip) { 1712 struct xrep_inode *ri = sc->buf; 1713 1714 ASSERT(ri != NULL); 1715 1716 error = xrep_dinode_problems(ri); 1717 if (error == -EBUSY) { 1718 /* 1719 * Directory scan to recover inode mode encountered a 1720 * busy inode, so we did not continue repairing things. 1721 */ 1722 return 0; 1723 } 1724 if (error) 1725 return error; 1726 1727 /* By this point we had better have a working incore inode. */ 1728 if (!sc->ip) 1729 return -EFSCORRUPTED; 1730 } 1731 1732 xfs_trans_ijoin(sc->tp, sc->ip, 0); 1733 1734 /* If we found corruption of any kind, try to fix it. */ 1735 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || 1736 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { 1737 error = xrep_inode_problems(sc); 1738 if (error) 1739 return error; 1740 } 1741 1742 /* See if we can clear the reflink flag. */ 1743 if (xfs_is_reflink_inode(sc->ip)) { 1744 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); 1745 if (error) 1746 return error; 1747 } 1748 1749 return xrep_defer_finish(sc); 1750 } 1751