1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_bit.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_sb.h" 17 #include "xfs_inode.h" 18 #include "xfs_icache.h" 19 #include "xfs_da_format.h" 20 #include "xfs_da_btree.h" 21 #include "xfs_dir2.h" 22 #include "xfs_dir2_priv.h" 23 #include "xfs_bmap.h" 24 #include "xfs_quota.h" 25 #include "xfs_bmap_btree.h" 26 #include "xfs_trans_space.h" 27 #include "xfs_bmap_util.h" 28 #include "xfs_exchmaps.h" 29 #include "xfs_exchrange.h" 30 #include "xfs_ag.h" 31 #include "xfs_parent.h" 32 #include "scrub/xfs_scrub.h" 33 #include "scrub/scrub.h" 34 #include "scrub/common.h" 35 #include "scrub/trace.h" 36 #include "scrub/repair.h" 37 #include "scrub/tempfile.h" 38 #include "scrub/tempexch.h" 39 #include "scrub/xfile.h" 40 #include "scrub/xfarray.h" 41 #include "scrub/xfblob.h" 42 #include "scrub/iscan.h" 43 #include "scrub/readdir.h" 44 #include "scrub/reap.h" 45 #include "scrub/findparent.h" 46 #include "scrub/orphanage.h" 47 #include "scrub/listxattr.h" 48 49 /* 50 * Directory Repair 51 * ================ 52 * 53 * We repair directories by reading the directory data blocks looking for 54 * directory entries that look salvageable (name passes verifiers, entry points 55 * to a valid allocated inode, etc). Each entry worth salvaging is stashed in 56 * memory, and the stashed entries are periodically replayed into a temporary 57 * directory to constrain memory use. Batching the construction of the 58 * temporary directory in this fashion reduces lock cycling of the directory 59 * being repaired and the temporary directory, and will later become important 60 * for parent pointer scanning. 61 * 62 * If parent pointers are enabled on this filesystem, we instead reconstruct 63 * the directory by visiting each parent pointer of each file in the filesystem 64 * and translating the relevant parent pointer records into dirents. In this 65 * case, it is advantageous to stash all directory entries created from parent 66 * pointers for a single child file before replaying them into the temporary 67 * directory. To save memory, the live filesystem scan reuses the findparent 68 * fields. Directory repair chooses either parent pointer scanning or 69 * directory entry salvaging, but not both. 70 * 71 * Directory entries added to the temporary directory do not elevate the link 72 * counts of the inodes found. When salvaging completes, the remaining stashed 73 * entries are replayed to the temporary directory. An atomic mapping exchange 74 * is used to commit the new directory blocks to the directory being repaired. 75 * This will disrupt readdir cursors. 76 * 77 * Locking Issues 78 * -------------- 79 * 80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on 81 * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects 82 * b's dotdot update. This is in contrast to every other dotdot update (link, 83 * remove, mkdir). If the repair code drops the ILOCK, it must either 84 * revalidate the dotdot entry or use dirent hooks to capture updates from 85 * other threads. 86 */ 87 88 /* Create a dirent in the tempdir. */ 89 #define XREP_DIRENT_ADD (1) 90 91 /* Remove a dirent from the tempdir. */ 92 #define XREP_DIRENT_REMOVE (2) 93 94 /* Directory entry to be restored in the new directory. */ 95 struct xrep_dirent { 96 /* Cookie for retrieval of the dirent name. */ 97 xfblob_cookie name_cookie; 98 99 /* Target inode number. */ 100 xfs_ino_t ino; 101 102 /* Length of the dirent name. */ 103 uint8_t namelen; 104 105 /* File type of the dirent. */ 106 uint8_t ftype; 107 108 /* XREP_DIRENT_{ADD,REMOVE} */ 109 uint8_t action; 110 }; 111 112 /* 113 * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names 114 * before we write them to the temp dir. 115 */ 116 #define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) 117 118 struct xrep_dir { 119 struct xfs_scrub *sc; 120 121 /* Fixed-size array of xrep_dirent structures. */ 122 struct xfarray *dir_entries; 123 124 /* Blobs containing directory entry names. */ 125 struct xfblob *dir_names; 126 127 /* Information for exchanging data forks at the end. */ 128 struct xrep_tempexch tx; 129 130 /* Preallocated args struct for performing dir operations */ 131 struct xfs_da_args args; 132 133 /* 134 * Information used to scan the filesystem to find the inumber of the 135 * dotdot entry for this directory. For directory salvaging when 136 * parent pointers are not enabled, we use the findparent_* functions 137 * on this object and access only the parent_ino field directly. 138 * 139 * When parent pointers are enabled, however, the pptr scanner uses the 140 * iscan, hooks, lock, and parent_ino fields of this object directly. 141 * @pscan.lock coordinates access to dir_entries, dir_names, 142 * parent_ino, subdirs, dirents, and args. This reduces the memory 143 * requirements of this structure. 144 */ 145 struct xrep_parent_scan_info pscan; 146 147 /* 148 * Context information for attaching this directory to the lost+found 149 * if this directory does not have a parent. 150 */ 151 struct xrep_adoption adoption; 152 153 /* How many subdirectories did we find? */ 154 uint64_t subdirs; 155 156 /* How many dirents did we find? */ 157 unsigned int dirents; 158 159 /* Should we move this directory to the orphanage? */ 160 bool needs_adoption; 161 162 /* Directory entry name, plus the trailing null. */ 163 struct xfs_name xname; 164 unsigned char namebuf[MAXNAMELEN]; 165 }; 166 167 /* Tear down all the incore stuff we created. */ 168 static void 169 xrep_dir_teardown( 170 struct xfs_scrub *sc) 171 { 172 struct xrep_dir *rd = sc->buf; 173 174 xrep_findparent_scan_teardown(&rd->pscan); 175 if (rd->dir_names) 176 xfblob_destroy(rd->dir_names); 177 rd->dir_names = NULL; 178 if (rd->dir_entries) 179 xfarray_destroy(rd->dir_entries); 180 rd->dir_names = NULL; 181 } 182 183 /* Set up for a directory repair. */ 184 int 185 xrep_setup_directory( 186 struct xfs_scrub *sc) 187 { 188 struct xrep_dir *rd; 189 int error; 190 191 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); 192 193 error = xrep_orphanage_try_create(sc); 194 if (error) 195 return error; 196 197 error = xrep_tempfile_create(sc, S_IFDIR); 198 if (error) 199 return error; 200 201 rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); 202 if (!rd) 203 return -ENOMEM; 204 rd->sc = sc; 205 rd->xname.name = rd->namebuf; 206 sc->buf = rd; 207 208 return 0; 209 } 210 211 /* 212 * Look up the dotdot entry and confirm that it's really the parent. 213 * Returns NULLFSINO if we don't know what to do. 214 */ 215 static inline xfs_ino_t 216 xrep_dir_lookup_parent( 217 struct xrep_dir *rd) 218 { 219 struct xfs_scrub *sc = rd->sc; 220 xfs_ino_t ino; 221 int error; 222 223 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); 224 if (error) 225 return NULLFSINO; 226 if (!xfs_verify_dir_ino(sc->mp, ino)) 227 return NULLFSINO; 228 229 error = xrep_findparent_confirm(sc, &ino); 230 if (error) 231 return NULLFSINO; 232 233 return ino; 234 } 235 236 /* 237 * Look up '..' in the dentry cache and confirm that it's really the parent. 238 * Returns NULLFSINO if the dcache misses or if the hit is implausible. 239 */ 240 static inline xfs_ino_t 241 xrep_dir_dcache_parent( 242 struct xrep_dir *rd) 243 { 244 struct xfs_scrub *sc = rd->sc; 245 xfs_ino_t parent_ino; 246 int error; 247 248 parent_ino = xrep_findparent_from_dcache(sc); 249 if (parent_ino == NULLFSINO) 250 return parent_ino; 251 252 error = xrep_findparent_confirm(sc, &parent_ino); 253 if (error) 254 return NULLFSINO; 255 256 return parent_ino; 257 } 258 259 /* Try to find the parent of the directory being repaired. */ 260 STATIC int 261 xrep_dir_find_parent( 262 struct xrep_dir *rd) 263 { 264 xfs_ino_t ino; 265 266 ino = xrep_findparent_self_reference(rd->sc); 267 if (ino != NULLFSINO) { 268 xrep_findparent_scan_finish_early(&rd->pscan, ino); 269 return 0; 270 } 271 272 ino = xrep_dir_dcache_parent(rd); 273 if (ino != NULLFSINO) { 274 xrep_findparent_scan_finish_early(&rd->pscan, ino); 275 return 0; 276 } 277 278 ino = xrep_dir_lookup_parent(rd); 279 if (ino != NULLFSINO) { 280 xrep_findparent_scan_finish_early(&rd->pscan, ino); 281 return 0; 282 } 283 284 /* 285 * A full filesystem scan is the last resort. On a busy filesystem, 286 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means 287 * that we don't know what who the parent is, so we should return to 288 * userspace. 289 */ 290 return xrep_findparent_scan(&rd->pscan); 291 } 292 293 /* 294 * Decide if we want to salvage this entry. We don't bother with oversized 295 * names or the dot entry. 296 */ 297 STATIC int 298 xrep_dir_want_salvage( 299 struct xrep_dir *rd, 300 const char *name, 301 int namelen, 302 xfs_ino_t ino) 303 { 304 struct xfs_mount *mp = rd->sc->mp; 305 306 /* No pointers to ourselves or to garbage. */ 307 if (ino == rd->sc->ip->i_ino) 308 return false; 309 if (!xfs_verify_dir_ino(mp, ino)) 310 return false; 311 312 /* No weird looking names or dot entries. */ 313 if (namelen >= MAXNAMELEN || namelen <= 0) 314 return false; 315 if (namelen == 1 && name[0] == '.') 316 return false; 317 if (!xfs_dir2_namecheck(name, namelen)) 318 return false; 319 320 return true; 321 } 322 323 /* 324 * Remember that we want to create a dirent in the tempdir. These stashed 325 * actions will be replayed later. 326 */ 327 STATIC int 328 xrep_dir_stash_createname( 329 struct xrep_dir *rd, 330 const struct xfs_name *name, 331 xfs_ino_t ino) 332 { 333 struct xrep_dirent dirent = { 334 .action = XREP_DIRENT_ADD, 335 .ino = ino, 336 .namelen = name->len, 337 .ftype = name->type, 338 }; 339 int error; 340 341 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); 342 343 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 344 if (error) 345 return error; 346 347 return xfarray_append(rd->dir_entries, &dirent); 348 } 349 350 /* 351 * Remember that we want to remove a dirent from the tempdir. These stashed 352 * actions will be replayed later. 353 */ 354 STATIC int 355 xrep_dir_stash_removename( 356 struct xrep_dir *rd, 357 const struct xfs_name *name, 358 xfs_ino_t ino) 359 { 360 struct xrep_dirent dirent = { 361 .action = XREP_DIRENT_REMOVE, 362 .ino = ino, 363 .namelen = name->len, 364 .ftype = name->type, 365 }; 366 int error; 367 368 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); 369 370 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 371 if (error) 372 return error; 373 374 return xfarray_append(rd->dir_entries, &dirent); 375 } 376 377 /* Allocate an in-core record to hold entries while we rebuild the dir data. */ 378 STATIC int 379 xrep_dir_salvage_entry( 380 struct xrep_dir *rd, 381 unsigned char *name, 382 unsigned int namelen, 383 xfs_ino_t ino) 384 { 385 struct xfs_name xname = { 386 .name = name, 387 }; 388 struct xfs_scrub *sc = rd->sc; 389 struct xfs_inode *ip; 390 unsigned int i = 0; 391 int error = 0; 392 393 if (xchk_should_terminate(sc, &error)) 394 return error; 395 396 /* 397 * Truncate the name to the first character that would trip namecheck. 398 * If we no longer have a name after that, ignore this entry. 399 */ 400 while (i < namelen && name[i] != 0 && name[i] != '/') 401 i++; 402 if (i == 0) 403 return 0; 404 xname.len = i; 405 406 /* Ignore '..' entries; we already picked the new parent. */ 407 if (xname.len == 2 && name[0] == '.' && name[1] == '.') { 408 trace_xrep_dir_salvaged_parent(sc->ip, ino); 409 return 0; 410 } 411 412 trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); 413 414 /* 415 * Compute the ftype or dump the entry if we can't. We don't lock the 416 * inode because inodes can't change type while we have a reference. 417 */ 418 error = xchk_iget(sc, ino, &ip); 419 if (error) 420 return 0; 421 422 /* Don't mix metadata and regular directory trees. */ 423 if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) { 424 xchk_irele(sc, ip); 425 return 0; 426 } 427 428 xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); 429 xchk_irele(sc, ip); 430 431 return xrep_dir_stash_createname(rd, &xname, ino); 432 } 433 434 /* Record a shortform directory entry for later reinsertion. */ 435 STATIC int 436 xrep_dir_salvage_sf_entry( 437 struct xrep_dir *rd, 438 struct xfs_dir2_sf_hdr *sfp, 439 struct xfs_dir2_sf_entry *sfep) 440 { 441 xfs_ino_t ino; 442 443 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); 444 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) 445 return 0; 446 447 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); 448 } 449 450 /* Record a regular directory entry for later reinsertion. */ 451 STATIC int 452 xrep_dir_salvage_data_entry( 453 struct xrep_dir *rd, 454 struct xfs_dir2_data_entry *dep) 455 { 456 xfs_ino_t ino; 457 458 ino = be64_to_cpu(dep->inumber); 459 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) 460 return 0; 461 462 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); 463 } 464 465 /* Try to recover block/data format directory entries. */ 466 STATIC int 467 xrep_dir_recover_data( 468 struct xrep_dir *rd, 469 struct xfs_buf *bp) 470 { 471 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; 472 unsigned int offset; 473 unsigned int end; 474 int error = 0; 475 476 /* 477 * Loop over the data portion of the block. 478 * Each object is a real entry (dep) or an unused one (dup). 479 */ 480 offset = geo->data_entry_offset; 481 end = min_t(unsigned int, BBTOB(bp->b_length), 482 xfs_dir3_data_end_offset(geo, bp->b_addr)); 483 484 while (offset < end) { 485 struct xfs_dir2_data_unused *dup = bp->b_addr + offset; 486 struct xfs_dir2_data_entry *dep = bp->b_addr + offset; 487 488 if (xchk_should_terminate(rd->sc, &error)) 489 return error; 490 491 /* Skip unused entries. */ 492 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 493 offset += be16_to_cpu(dup->length); 494 continue; 495 } 496 497 /* Don't walk off the end of the block. */ 498 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); 499 if (offset > end) 500 break; 501 502 /* Ok, let's save this entry. */ 503 error = xrep_dir_salvage_data_entry(rd, dep); 504 if (error) 505 return error; 506 507 } 508 509 return 0; 510 } 511 512 /* Try to recover shortform directory entries. */ 513 STATIC int 514 xrep_dir_recover_sf( 515 struct xrep_dir *rd) 516 { 517 struct xfs_dir2_sf_hdr *hdr; 518 struct xfs_dir2_sf_entry *sfep; 519 struct xfs_dir2_sf_entry *next; 520 struct xfs_ifork *ifp; 521 xfs_ino_t ino; 522 unsigned char *end; 523 int error = 0; 524 525 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); 526 hdr = ifp->if_data; 527 end = (unsigned char *)ifp->if_data + ifp->if_bytes; 528 529 ino = xfs_dir2_sf_get_parent_ino(hdr); 530 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); 531 532 sfep = xfs_dir2_sf_firstentry(hdr); 533 while ((unsigned char *)sfep < end) { 534 if (xchk_should_terminate(rd->sc, &error)) 535 return error; 536 537 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); 538 if ((unsigned char *)next > end) 539 break; 540 541 /* Ok, let's save this entry. */ 542 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); 543 if (error) 544 return error; 545 546 sfep = next; 547 } 548 549 return 0; 550 } 551 552 /* 553 * Try to figure out the format of this directory from the data fork mappings 554 * and the directory size. If we can be reasonably sure of format, we can be 555 * more aggressive in salvaging directory entries. On return, @magic_guess 556 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" 557 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, 558 * and 0 if we can't tell. 559 */ 560 STATIC void 561 xrep_dir_guess_format( 562 struct xrep_dir *rd, 563 __be32 *magic_guess) 564 { 565 struct xfs_inode *dp = rd->sc->ip; 566 struct xfs_mount *mp = rd->sc->mp; 567 struct xfs_da_geometry *geo = mp->m_dir_geo; 568 xfs_fileoff_t last; 569 int error; 570 571 ASSERT(xfs_has_crc(mp)); 572 573 *magic_guess = 0; 574 575 /* 576 * If there's a single directory block and the directory size is 577 * exactly one block, this has to be a single block format directory. 578 */ 579 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); 580 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && 581 dp->i_disk_size == geo->blksize) { 582 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); 583 return; 584 } 585 586 /* 587 * If the last extent before the leaf offset matches the directory 588 * size and the directory size is larger than 1 block, this is a 589 * data format directory. 590 */ 591 last = geo->leafblk; 592 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); 593 if (!error && 594 XFS_FSB_TO_B(mp, last) > geo->blksize && 595 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { 596 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); 597 return; 598 } 599 } 600 601 /* Recover directory entries from a specific directory block. */ 602 STATIC int 603 xrep_dir_recover_dirblock( 604 struct xrep_dir *rd, 605 __be32 magic_guess, 606 xfs_dablk_t dabno) 607 { 608 struct xfs_dir2_data_hdr *hdr; 609 struct xfs_buf *bp; 610 __be32 oldmagic; 611 int error; 612 613 /* 614 * Try to read buffer. We invalidate them in the next step so we don't 615 * bother to set a buffer type or ops. 616 */ 617 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, 618 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); 619 if (error || !bp) 620 return error; 621 622 hdr = bp->b_addr; 623 oldmagic = hdr->magic; 624 625 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, 626 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); 627 628 /* 629 * If we're sure of the block's format, proceed with the salvage 630 * operation using the specified magic number. 631 */ 632 if (magic_guess) { 633 hdr->magic = magic_guess; 634 goto recover; 635 } 636 637 /* 638 * If we couldn't guess what type of directory this is, then we will 639 * only salvage entries from directory blocks that match the magic 640 * number and pass verifiers. 641 */ 642 switch (hdr->magic) { 643 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): 644 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): 645 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) 646 goto out; 647 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) 648 goto out; 649 break; 650 case cpu_to_be32(XFS_DIR2_DATA_MAGIC): 651 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 652 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) 653 goto out; 654 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) 655 goto out; 656 break; 657 default: 658 goto out; 659 } 660 661 recover: 662 error = xrep_dir_recover_data(rd, bp); 663 664 out: 665 hdr->magic = oldmagic; 666 xfs_trans_brelse(rd->sc->tp, bp); 667 return error; 668 } 669 670 static inline void 671 xrep_dir_init_args( 672 struct xrep_dir *rd, 673 struct xfs_inode *dp, 674 const struct xfs_name *name) 675 { 676 memset(&rd->args, 0, sizeof(struct xfs_da_args)); 677 rd->args.geo = rd->sc->mp->m_dir_geo; 678 rd->args.whichfork = XFS_DATA_FORK; 679 rd->args.owner = rd->sc->ip->i_ino; 680 rd->args.trans = rd->sc->tp; 681 rd->args.dp = dp; 682 if (!name) 683 return; 684 rd->args.name = name->name; 685 rd->args.namelen = name->len; 686 rd->args.filetype = name->type; 687 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); 688 } 689 690 /* Replay a stashed createname into the temporary directory. */ 691 STATIC int 692 xrep_dir_replay_createname( 693 struct xrep_dir *rd, 694 const struct xfs_name *name, 695 xfs_ino_t inum, 696 xfs_extlen_t total) 697 { 698 struct xfs_scrub *sc = rd->sc; 699 struct xfs_inode *dp = rd->sc->tempip; 700 int error; 701 702 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 703 704 error = xfs_dir_ino_validate(sc->mp, inum); 705 if (error) 706 return error; 707 708 trace_xrep_dir_replay_createname(dp, name, inum); 709 710 xrep_dir_init_args(rd, dp, name); 711 rd->args.inumber = inum; 712 rd->args.total = total; 713 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 714 return xfs_dir_createname_args(&rd->args); 715 } 716 717 /* Replay a stashed removename onto the temporary directory. */ 718 STATIC int 719 xrep_dir_replay_removename( 720 struct xrep_dir *rd, 721 const struct xfs_name *name, 722 xfs_extlen_t total) 723 { 724 struct xfs_inode *dp = rd->args.dp; 725 726 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 727 728 xrep_dir_init_args(rd, dp, name); 729 rd->args.op_flags = 0; 730 rd->args.total = total; 731 732 trace_xrep_dir_replay_removename(dp, name, 0); 733 return xfs_dir_removename_args(&rd->args); 734 } 735 736 /* 737 * Add this stashed incore directory entry to the temporary directory. 738 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and 739 * must not be in transaction context. 740 */ 741 STATIC int 742 xrep_dir_replay_update( 743 struct xrep_dir *rd, 744 const struct xfs_name *xname, 745 const struct xrep_dirent *dirent) 746 { 747 struct xfs_mount *mp = rd->sc->mp; 748 #ifdef DEBUG 749 xfs_ino_t ino; 750 #endif 751 uint resblks; 752 int error; 753 754 resblks = xfs_link_space_res(mp, xname->len); 755 error = xchk_trans_alloc(rd->sc, resblks); 756 if (error) 757 return error; 758 759 /* Lock the temporary directory and join it to the transaction */ 760 xrep_tempfile_ilock(rd->sc); 761 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0); 762 763 switch (dirent->action) { 764 case XREP_DIRENT_ADD: 765 /* 766 * Create a replacement dirent in the temporary directory. 767 * Note that _createname doesn't check for existing entries. 768 * There shouldn't be any in the temporary dir, but we'll 769 * verify this in debug mode. 770 */ 771 #ifdef DEBUG 772 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 773 if (error != -ENOENT) { 774 ASSERT(error != -ENOENT); 775 goto out_cancel; 776 } 777 #endif 778 779 error = xrep_dir_replay_createname(rd, xname, dirent->ino, 780 resblks); 781 if (error) 782 goto out_cancel; 783 784 if (xname->type == XFS_DIR3_FT_DIR) 785 rd->subdirs++; 786 rd->dirents++; 787 break; 788 case XREP_DIRENT_REMOVE: 789 /* 790 * Remove a dirent from the temporary directory. Note that 791 * _removename doesn't check the inode target of the exist 792 * entry. There should be a perfect match in the temporary 793 * dir, but we'll verify this in debug mode. 794 */ 795 #ifdef DEBUG 796 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 797 if (error) { 798 ASSERT(error != 0); 799 goto out_cancel; 800 } 801 if (ino != dirent->ino) { 802 ASSERT(ino == dirent->ino); 803 error = -EIO; 804 goto out_cancel; 805 } 806 #endif 807 808 error = xrep_dir_replay_removename(rd, xname, resblks); 809 if (error) 810 goto out_cancel; 811 812 if (xname->type == XFS_DIR3_FT_DIR) 813 rd->subdirs--; 814 rd->dirents--; 815 break; 816 default: 817 ASSERT(0); 818 error = -EIO; 819 goto out_cancel; 820 } 821 822 /* Commit and unlock. */ 823 error = xrep_trans_commit(rd->sc); 824 if (error) 825 return error; 826 827 xrep_tempfile_iunlock(rd->sc); 828 return 0; 829 out_cancel: 830 xchk_trans_cancel(rd->sc); 831 xrep_tempfile_iunlock(rd->sc); 832 return error; 833 } 834 835 /* 836 * Flush stashed incore dirent updates that have been recorded by the scanner. 837 * This is done to reduce the memory requirements of the directory rebuild, 838 * since directories can contain up to 32GB of directory data. 839 * 840 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir 841 * IOLOCK. 842 */ 843 STATIC int 844 xrep_dir_replay_updates( 845 struct xrep_dir *rd) 846 { 847 xfarray_idx_t array_cur; 848 int error; 849 850 /* Add all the salvaged dirents to the temporary directory. */ 851 mutex_lock(&rd->pscan.lock); 852 foreach_xfarray_idx(rd->dir_entries, array_cur) { 853 struct xrep_dirent dirent; 854 855 error = xfarray_load(rd->dir_entries, array_cur, &dirent); 856 if (error) 857 goto out_unlock; 858 859 error = xfblob_loadname(rd->dir_names, dirent.name_cookie, 860 &rd->xname, dirent.namelen); 861 if (error) 862 goto out_unlock; 863 rd->xname.type = dirent.ftype; 864 mutex_unlock(&rd->pscan.lock); 865 866 error = xrep_dir_replay_update(rd, &rd->xname, &dirent); 867 if (error) 868 return error; 869 mutex_lock(&rd->pscan.lock); 870 } 871 872 /* Empty out both arrays now that we've added the entries. */ 873 xfarray_truncate(rd->dir_entries); 874 xfblob_truncate(rd->dir_names); 875 mutex_unlock(&rd->pscan.lock); 876 return 0; 877 out_unlock: 878 mutex_unlock(&rd->pscan.lock); 879 return error; 880 } 881 882 /* 883 * Periodically flush stashed directory entries to the temporary dir. This 884 * is done to reduce the memory requirements of the directory rebuild, since 885 * directories can contain up to 32GB of directory data. 886 */ 887 STATIC int 888 xrep_dir_flush_stashed( 889 struct xrep_dir *rd) 890 { 891 int error; 892 893 /* 894 * Entering this function, the scrub context has a reference to the 895 * inode being repaired, the temporary file, and a scrub transaction 896 * that we use during dirent salvaging to avoid livelocking if there 897 * are cycles in the directory structures. We hold ILOCK_EXCL on both 898 * the inode being repaired and the temporary file, though they are 899 * not ijoined to the scrub transaction. 900 * 901 * To constrain kernel memory use, we occasionally write salvaged 902 * dirents from the xfarray and xfblob structures into the temporary 903 * directory in preparation for exchanging the directory structures at 904 * the end. Updating the temporary file requires a transaction, so we 905 * commit the scrub transaction and drop the two ILOCKs so that 906 * we can allocate whatever transaction we want. 907 * 908 * We still hold IOLOCK_EXCL on the inode being repaired, which 909 * prevents anyone from accessing the damaged directory data while we 910 * repair it. 911 */ 912 error = xrep_trans_commit(rd->sc); 913 if (error) 914 return error; 915 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); 916 917 /* 918 * Take the IOLOCK of the temporary file while we modify dirents. This 919 * isn't strictly required because the temporary file is never revealed 920 * to userspace, but we follow the same locking rules. We still hold 921 * sc->ip's IOLOCK. 922 */ 923 error = xrep_tempfile_iolock_polled(rd->sc); 924 if (error) 925 return error; 926 927 /* Write to the tempdir all the updates that we've stashed. */ 928 error = xrep_dir_replay_updates(rd); 929 xrep_tempfile_iounlock(rd->sc); 930 if (error) 931 return error; 932 933 /* 934 * Recreate the salvage transaction and relock the dir we're salvaging. 935 */ 936 error = xchk_trans_alloc(rd->sc, 0); 937 if (error) 938 return error; 939 xchk_ilock(rd->sc, XFS_ILOCK_EXCL); 940 return 0; 941 } 942 943 /* Decide if we've stashed too much dirent data in memory. */ 944 static inline bool 945 xrep_dir_want_flush_stashed( 946 struct xrep_dir *rd) 947 { 948 unsigned long long bytes; 949 950 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); 951 return bytes > XREP_DIR_MAX_STASH_BYTES; 952 } 953 954 /* Extract as many directory entries as we can. */ 955 STATIC int 956 xrep_dir_recover( 957 struct xrep_dir *rd) 958 { 959 struct xfs_bmbt_irec got; 960 struct xfs_scrub *sc = rd->sc; 961 struct xfs_da_geometry *geo = sc->mp->m_dir_geo; 962 xfs_fileoff_t offset; 963 xfs_dablk_t dabno; 964 __be32 magic_guess; 965 int nmap; 966 int error; 967 968 xrep_dir_guess_format(rd, &magic_guess); 969 970 /* Iterate each directory data block in the data fork. */ 971 for (offset = 0; 972 offset < geo->leafblk; 973 offset = got.br_startoff + got.br_blockcount) { 974 nmap = 1; 975 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, 976 &got, &nmap, 0); 977 if (error) 978 return error; 979 if (nmap != 1) 980 return -EFSCORRUPTED; 981 if (!xfs_bmap_is_written_extent(&got)) 982 continue; 983 984 for (dabno = round_up(got.br_startoff, geo->fsbcount); 985 dabno < got.br_startoff + got.br_blockcount; 986 dabno += geo->fsbcount) { 987 if (xchk_should_terminate(rd->sc, &error)) 988 return error; 989 990 error = xrep_dir_recover_dirblock(rd, 991 magic_guess, dabno); 992 if (error) 993 return error; 994 995 /* Flush dirents to constrain memory usage. */ 996 if (xrep_dir_want_flush_stashed(rd)) { 997 error = xrep_dir_flush_stashed(rd); 998 if (error) 999 return error; 1000 } 1001 } 1002 } 1003 1004 return 0; 1005 } 1006 1007 /* 1008 * Find all the directory entries for this inode by scraping them out of the 1009 * directory leaf blocks by hand, and flushing them into the temp dir. 1010 */ 1011 STATIC int 1012 xrep_dir_find_entries( 1013 struct xrep_dir *rd) 1014 { 1015 struct xfs_inode *dp = rd->sc->ip; 1016 int error; 1017 1018 /* 1019 * Salvage directory entries from the old directory, and write them to 1020 * the temporary directory. 1021 */ 1022 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 1023 error = xrep_dir_recover_sf(rd); 1024 } else { 1025 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); 1026 if (error) 1027 return error; 1028 1029 error = xrep_dir_recover(rd); 1030 } 1031 if (error) 1032 return error; 1033 1034 return xrep_dir_flush_stashed(rd); 1035 } 1036 1037 /* Scan all files in the filesystem for dirents. */ 1038 STATIC int 1039 xrep_dir_salvage_entries( 1040 struct xrep_dir *rd) 1041 { 1042 struct xfs_scrub *sc = rd->sc; 1043 int error; 1044 1045 /* 1046 * Drop the ILOCK on this directory so that we can scan for this 1047 * directory's parent. Figure out who is going to be the parent of 1048 * this directory, then retake the ILOCK so that we can salvage 1049 * directory entries. 1050 */ 1051 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1052 error = xrep_dir_find_parent(rd); 1053 xchk_ilock(sc, XFS_ILOCK_EXCL); 1054 if (error) 1055 return error; 1056 1057 /* 1058 * Collect directory entries by parsing raw leaf blocks to salvage 1059 * whatever we can. When we're done, free the staging memory before 1060 * exchanging the directories to reduce memory usage. 1061 */ 1062 error = xrep_dir_find_entries(rd); 1063 if (error) 1064 return error; 1065 1066 /* 1067 * Cancel the repair transaction and drop the ILOCK so that we can 1068 * (later) use the atomic mapping exchange functions to compute the 1069 * correct block reservations and re-lock the inodes. 1070 * 1071 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory 1072 * modifications, but there's nothing to prevent userspace from reading 1073 * the directory until we're ready for the exchange operation. Reads 1074 * will return -EIO without shutting down the fs, so we're ok with 1075 * that. 1076 * 1077 * The VFS can change dotdot on us, but the findparent scan will keep 1078 * our incore parent inode up to date. See the note on locking issues 1079 * for more details. 1080 */ 1081 error = xrep_trans_commit(sc); 1082 if (error) 1083 return error; 1084 1085 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1086 return 0; 1087 } 1088 1089 1090 /* 1091 * Examine a parent pointer of a file. If it leads us back to the directory 1092 * that we're rebuilding, create an incore dirent from the parent pointer and 1093 * stash it. 1094 */ 1095 STATIC int 1096 xrep_dir_scan_pptr( 1097 struct xfs_scrub *sc, 1098 struct xfs_inode *ip, 1099 unsigned int attr_flags, 1100 const unsigned char *name, 1101 unsigned int namelen, 1102 const void *value, 1103 unsigned int valuelen, 1104 void *priv) 1105 { 1106 struct xfs_name xname = { 1107 .name = name, 1108 .len = namelen, 1109 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), 1110 }; 1111 xfs_ino_t parent_ino; 1112 uint32_t parent_gen; 1113 struct xrep_dir *rd = priv; 1114 int error; 1115 1116 if (!(attr_flags & XFS_ATTR_PARENT)) 1117 return 0; 1118 1119 /* 1120 * Ignore parent pointers that point back to a different dir, list the 1121 * wrong generation number, or are invalid. 1122 */ 1123 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, 1124 valuelen, &parent_ino, &parent_gen); 1125 if (error) 1126 return error; 1127 1128 if (parent_ino != sc->ip->i_ino || 1129 parent_gen != VFS_I(sc->ip)->i_generation) 1130 return 0; 1131 1132 mutex_lock(&rd->pscan.lock); 1133 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); 1134 mutex_unlock(&rd->pscan.lock); 1135 return error; 1136 } 1137 1138 /* 1139 * If this child dirent points to the directory being repaired, remember that 1140 * fact so that we can reset the dotdot entry if necessary. 1141 */ 1142 STATIC int 1143 xrep_dir_scan_dirent( 1144 struct xfs_scrub *sc, 1145 struct xfs_inode *dp, 1146 xfs_dir2_dataptr_t dapos, 1147 const struct xfs_name *name, 1148 xfs_ino_t ino, 1149 void *priv) 1150 { 1151 struct xrep_dir *rd = priv; 1152 1153 /* Dirent doesn't point to this directory. */ 1154 if (ino != rd->sc->ip->i_ino) 1155 return 0; 1156 1157 /* Ignore garbage inum. */ 1158 if (!xfs_verify_dir_ino(rd->sc->mp, ino)) 1159 return 0; 1160 1161 /* No weird looking names. */ 1162 if (name->len >= MAXNAMELEN || name->len <= 0) 1163 return 0; 1164 1165 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 1166 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 1167 xfs_dir2_samename(name, &xfs_name_dot)) 1168 return 0; 1169 1170 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, 1171 dp->i_ino); 1172 1173 xrep_findparent_scan_found(&rd->pscan, dp->i_ino); 1174 return 0; 1175 } 1176 1177 /* 1178 * Decide if we want to look for child dirents or parent pointers in this file. 1179 * Skip the dir being repaired and any files being used to stage repairs. 1180 */ 1181 static inline bool 1182 xrep_dir_want_scan( 1183 struct xrep_dir *rd, 1184 const struct xfs_inode *ip) 1185 { 1186 return ip != rd->sc->ip && !xrep_is_tempfile(ip); 1187 } 1188 1189 /* 1190 * Take ILOCK on a file that we want to scan. 1191 * 1192 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or 1193 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. 1194 */ 1195 static inline unsigned int 1196 xrep_dir_scan_ilock( 1197 struct xrep_dir *rd, 1198 struct xfs_inode *ip) 1199 { 1200 uint lock_mode = XFS_ILOCK_SHARED; 1201 1202 /* Need to take the shared ILOCK to advance the iscan cursor. */ 1203 if (!xrep_dir_want_scan(rd, ip)) 1204 goto lock; 1205 1206 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { 1207 lock_mode = XFS_ILOCK_EXCL; 1208 goto lock; 1209 } 1210 1211 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) 1212 lock_mode = XFS_ILOCK_EXCL; 1213 1214 lock: 1215 xfs_ilock(ip, lock_mode); 1216 return lock_mode; 1217 } 1218 1219 /* 1220 * Scan this file for relevant child dirents or parent pointers that point to 1221 * the directory we're rebuilding. 1222 */ 1223 STATIC int 1224 xrep_dir_scan_file( 1225 struct xrep_dir *rd, 1226 struct xfs_inode *ip) 1227 { 1228 unsigned int lock_mode; 1229 int error = 0; 1230 1231 lock_mode = xrep_dir_scan_ilock(rd, ip); 1232 1233 if (!xrep_dir_want_scan(rd, ip)) 1234 goto scan_done; 1235 1236 /* 1237 * If the extended attributes look as though they has been zapped by 1238 * the inode record repair code, we cannot scan for parent pointers. 1239 */ 1240 if (xchk_pptr_looks_zapped(ip)) { 1241 error = -EBUSY; 1242 goto scan_done; 1243 } 1244 1245 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); 1246 if (error) 1247 goto scan_done; 1248 1249 if (S_ISDIR(VFS_I(ip)->i_mode)) { 1250 /* 1251 * If the directory looks as though it has been zapped by the 1252 * inode record repair code, we cannot scan for child dirents. 1253 */ 1254 if (xchk_dir_looks_zapped(ip)) { 1255 error = -EBUSY; 1256 goto scan_done; 1257 } 1258 1259 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); 1260 if (error) 1261 goto scan_done; 1262 } 1263 1264 scan_done: 1265 xchk_iscan_mark_visited(&rd->pscan.iscan, ip); 1266 xfs_iunlock(ip, lock_mode); 1267 return error; 1268 } 1269 1270 /* 1271 * Scan all files in the filesystem for parent pointers that we can turn into 1272 * replacement dirents, and a dirent that we can use to set the dotdot pointer. 1273 */ 1274 STATIC int 1275 xrep_dir_scan_dirtree( 1276 struct xrep_dir *rd) 1277 { 1278 struct xfs_scrub *sc = rd->sc; 1279 struct xfs_inode *ip; 1280 int error; 1281 1282 /* Roots of directory trees are their own parents. */ 1283 if (xchk_inode_is_dirtree_root(sc->ip)) 1284 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); 1285 1286 /* 1287 * Filesystem scans are time consuming. Drop the directory ILOCK and 1288 * all other resources for the duration of the scan and hope for the 1289 * best. The live update hooks will keep our scan information up to 1290 * date even though we've dropped the locks. 1291 */ 1292 xchk_trans_cancel(sc); 1293 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) 1294 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | 1295 XFS_ILOCK_EXCL)); 1296 xchk_trans_alloc_empty(sc); 1297 1298 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { 1299 bool flush; 1300 1301 error = xrep_dir_scan_file(rd, ip); 1302 xchk_irele(sc, ip); 1303 if (error) 1304 break; 1305 1306 /* Flush stashed dirent updates to constrain memory usage. */ 1307 mutex_lock(&rd->pscan.lock); 1308 flush = xrep_dir_want_flush_stashed(rd); 1309 mutex_unlock(&rd->pscan.lock); 1310 if (flush) { 1311 xchk_trans_cancel(sc); 1312 1313 error = xrep_tempfile_iolock_polled(sc); 1314 if (error) 1315 break; 1316 1317 error = xrep_dir_replay_updates(rd); 1318 xrep_tempfile_iounlock(sc); 1319 if (error) 1320 break; 1321 1322 xchk_trans_alloc_empty(sc); 1323 } 1324 1325 if (xchk_should_terminate(sc, &error)) 1326 break; 1327 } 1328 xchk_iscan_iter_finish(&rd->pscan.iscan); 1329 if (error) { 1330 /* 1331 * If we couldn't grab an inode that was busy with a state 1332 * change, change the error code so that we exit to userspace 1333 * as quickly as possible. 1334 */ 1335 if (error == -EBUSY) 1336 return -ECANCELED; 1337 return error; 1338 } 1339 1340 /* 1341 * Cancel the empty transaction so that we can (later) use the atomic 1342 * file mapping exchange functions to lock files and commit the new 1343 * directory. 1344 */ 1345 xchk_trans_cancel(rd->sc); 1346 return 0; 1347 } 1348 1349 /* 1350 * Capture dirent updates being made by other threads which are relevant to the 1351 * directory being repaired. 1352 */ 1353 STATIC int 1354 xrep_dir_live_update( 1355 struct notifier_block *nb, 1356 unsigned long action, 1357 void *data) 1358 { 1359 struct xfs_dir_update_params *p = data; 1360 struct xrep_dir *rd; 1361 struct xfs_scrub *sc; 1362 int error = 0; 1363 1364 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); 1365 sc = rd->sc; 1366 1367 /* 1368 * This thread updated a child dirent in the directory that we're 1369 * rebuilding. Stash the update for replay against the temporary 1370 * directory. 1371 */ 1372 if (p->dp->i_ino == sc->ip->i_ino && 1373 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { 1374 mutex_lock(&rd->pscan.lock); 1375 if (p->delta > 0) 1376 error = xrep_dir_stash_createname(rd, p->name, 1377 p->ip->i_ino); 1378 else 1379 error = xrep_dir_stash_removename(rd, p->name, 1380 p->ip->i_ino); 1381 mutex_unlock(&rd->pscan.lock); 1382 if (error) 1383 goto out_abort; 1384 } 1385 1386 /* 1387 * This thread updated another directory's child dirent that points to 1388 * the directory that we're rebuilding, so remember the new dotdot 1389 * target. 1390 */ 1391 if (p->ip->i_ino == sc->ip->i_ino && 1392 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { 1393 if (p->delta > 0) { 1394 trace_xrep_dir_stash_createname(sc->tempip, 1395 &xfs_name_dotdot, 1396 p->dp->i_ino); 1397 1398 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); 1399 } else { 1400 trace_xrep_dir_stash_removename(sc->tempip, 1401 &xfs_name_dotdot, 1402 rd->pscan.parent_ino); 1403 1404 xrep_findparent_scan_found(&rd->pscan, NULLFSINO); 1405 } 1406 } 1407 1408 return NOTIFY_DONE; 1409 out_abort: 1410 xchk_iscan_abort(&rd->pscan.iscan); 1411 return NOTIFY_DONE; 1412 } 1413 1414 /* 1415 * Free all the directory blocks and reset the data fork. The caller must 1416 * join the inode to the transaction. This function returns with the inode 1417 * joined to a clean scrub transaction. 1418 */ 1419 STATIC int 1420 xrep_dir_reset_fork( 1421 struct xrep_dir *rd, 1422 xfs_ino_t parent_ino) 1423 { 1424 struct xfs_scrub *sc = rd->sc; 1425 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); 1426 int error; 1427 1428 /* Unmap all the directory buffers. */ 1429 if (xfs_ifork_has_extents(ifp)) { 1430 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); 1431 if (error) 1432 return error; 1433 } 1434 1435 trace_xrep_dir_reset_fork(sc->tempip, parent_ino); 1436 1437 /* Reset the data fork to an empty data fork. */ 1438 xfs_idestroy_fork(ifp); 1439 ifp->if_bytes = 0; 1440 sc->tempip->i_disk_size = 0; 1441 1442 /* Reinitialize the short form directory. */ 1443 xrep_dir_init_args(rd, sc->tempip, NULL); 1444 return xfs_dir2_sf_create(&rd->args, parent_ino); 1445 } 1446 1447 /* 1448 * Prepare both inodes' directory forks for exchanging mappings. Promote the 1449 * tempfile from short format to leaf format, and if the file being repaired 1450 * has a short format data fork, turn it into an empty extent list. 1451 */ 1452 STATIC int 1453 xrep_dir_swap_prep( 1454 struct xfs_scrub *sc, 1455 bool temp_local, 1456 bool ip_local) 1457 { 1458 int error; 1459 1460 /* 1461 * If the tempfile's directory is in shortform format, convert that to 1462 * a single leaf extent so that we can use the atomic mapping exchange. 1463 */ 1464 if (temp_local) { 1465 struct xfs_da_args args = { 1466 .dp = sc->tempip, 1467 .geo = sc->mp->m_dir_geo, 1468 .whichfork = XFS_DATA_FORK, 1469 .trans = sc->tp, 1470 .total = 1, 1471 .owner = sc->ip->i_ino, 1472 }; 1473 1474 error = xfs_dir2_sf_to_block(&args); 1475 if (error) 1476 return error; 1477 1478 /* 1479 * Roll the deferred log items to get us back to a clean 1480 * transaction. 1481 */ 1482 error = xfs_defer_finish(&sc->tp); 1483 if (error) 1484 return error; 1485 } 1486 1487 /* 1488 * If the file being repaired had a shortform data fork, convert that 1489 * to an empty extent list in preparation for the atomic mapping 1490 * exchange. 1491 */ 1492 if (ip_local) { 1493 struct xfs_ifork *ifp; 1494 1495 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1496 xfs_idestroy_fork(ifp); 1497 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 1498 ifp->if_nextents = 0; 1499 ifp->if_bytes = 0; 1500 ifp->if_data = NULL; 1501 ifp->if_height = 0; 1502 1503 xfs_trans_log_inode(sc->tp, sc->ip, 1504 XFS_ILOG_CORE | XFS_ILOG_DDATA); 1505 } 1506 1507 return 0; 1508 } 1509 1510 /* 1511 * Replace the inode number of a directory entry. 1512 */ 1513 static int 1514 xrep_dir_replace( 1515 struct xrep_dir *rd, 1516 struct xfs_inode *dp, 1517 const struct xfs_name *name, 1518 xfs_ino_t inum, 1519 xfs_extlen_t total) 1520 { 1521 struct xfs_scrub *sc = rd->sc; 1522 int error; 1523 1524 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 1525 1526 error = xfs_dir_ino_validate(sc->mp, inum); 1527 if (error) 1528 return error; 1529 1530 xrep_dir_init_args(rd, dp, name); 1531 rd->args.inumber = inum; 1532 rd->args.total = total; 1533 return xfs_dir_replace_args(&rd->args); 1534 } 1535 1536 /* 1537 * Reset the link count of this directory and adjust the unlinked list pointers 1538 * as needed. 1539 */ 1540 STATIC int 1541 xrep_dir_set_nlink( 1542 struct xrep_dir *rd) 1543 { 1544 struct xfs_scrub *sc = rd->sc; 1545 struct xfs_inode *dp = sc->ip; 1546 struct xfs_perag *pag; 1547 unsigned int new_nlink = min_t(unsigned long long, 1548 rd->subdirs + 2, 1549 XFS_NLINK_PINNED); 1550 int error; 1551 1552 /* 1553 * The directory is not on the incore unlinked list, which means that 1554 * it needs to be reachable via the directory tree. Update the nlink 1555 * with our observed link count. If the directory has no parent, it 1556 * will be moved to the orphanage. 1557 */ 1558 if (!xfs_inode_on_unlinked_list(dp)) 1559 goto reset_nlink; 1560 1561 /* 1562 * The directory is on the unlinked list and we did not find any 1563 * dirents. Set the link count to zero and let the directory 1564 * inactivate when the last reference drops. 1565 */ 1566 if (rd->dirents == 0) { 1567 rd->needs_adoption = false; 1568 new_nlink = 0; 1569 goto reset_nlink; 1570 } 1571 1572 /* 1573 * The directory is on the unlinked list and we found dirents. This 1574 * directory needs to be reachable via the directory tree. Remove the 1575 * dir from the unlinked list and update nlink with the observed link 1576 * count. If the directory has no parent, it will be moved to the 1577 * orphanage. 1578 */ 1579 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); 1580 if (!pag) { 1581 ASSERT(0); 1582 return -EFSCORRUPTED; 1583 } 1584 1585 error = xfs_iunlink_remove(sc->tp, pag, dp); 1586 xfs_perag_put(pag); 1587 if (error) 1588 return error; 1589 1590 reset_nlink: 1591 if (VFS_I(dp)->i_nlink != new_nlink) 1592 set_nlink(VFS_I(dp), new_nlink); 1593 return 0; 1594 } 1595 1596 /* 1597 * Finish replaying stashed dirent updates, allocate a transaction for 1598 * exchanging data fork mappings, and take the ILOCKs of both directories 1599 * before we commit the new directory structure. 1600 */ 1601 STATIC int 1602 xrep_dir_finalize_tempdir( 1603 struct xrep_dir *rd) 1604 { 1605 struct xfs_scrub *sc = rd->sc; 1606 int error; 1607 1608 if (!xfs_has_parent(sc->mp)) 1609 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1610 1611 /* 1612 * Repair relies on the ILOCK to quiesce all possible dirent updates. 1613 * Replay all queued dirent updates into the tempdir before exchanging 1614 * the contents, even if that means dropping the ILOCKs and the 1615 * transaction. 1616 */ 1617 do { 1618 error = xrep_dir_replay_updates(rd); 1619 if (error) 1620 return error; 1621 1622 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1623 if (error) 1624 return error; 1625 1626 if (xfarray_length(rd->dir_entries) == 0) 1627 break; 1628 1629 xchk_trans_cancel(sc); 1630 xrep_tempfile_iunlock_both(sc); 1631 } while (!xchk_should_terminate(sc, &error)); 1632 return error; 1633 } 1634 1635 /* Exchange the temporary directory's data fork with the one being repaired. */ 1636 STATIC int 1637 xrep_dir_swap( 1638 struct xrep_dir *rd) 1639 { 1640 struct xfs_scrub *sc = rd->sc; 1641 xfs_ino_t ino; 1642 bool ip_local, temp_local; 1643 int error = 0; 1644 1645 /* 1646 * If we never found the parent for this directory, temporarily assign 1647 * the root dir as the parent; we'll move this to the orphanage after 1648 * exchanging the dir contents. We hold the ILOCK of the dir being 1649 * repaired, so we're not worried about racy updates of dotdot. 1650 */ 1651 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); 1652 if (rd->pscan.parent_ino == NULLFSINO) { 1653 rd->needs_adoption = true; 1654 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; 1655 } 1656 1657 /* 1658 * Reset the temporary directory's '..' entry to point to the parent 1659 * that we found. The dirent replace code asserts if the dirent 1660 * already points at the new inumber, so we look it up here. 1661 * 1662 * It's also possible that this replacement could also expand a sf 1663 * tempdir into block format. 1664 */ 1665 error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino); 1666 if (error) 1667 return error; 1668 1669 if (rd->pscan.parent_ino != ino) { 1670 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, 1671 rd->pscan.parent_ino, rd->tx.req.resblks); 1672 if (error) 1673 return error; 1674 } 1675 1676 /* 1677 * Changing the dot and dotdot entries could have changed the shape of 1678 * the directory, so we recompute these. 1679 */ 1680 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1681 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1682 1683 /* 1684 * If the both files have a local format data fork and the rebuilt 1685 * directory data would fit in the repaired file's data fork, copy 1686 * the contents from the tempfile and update the directory link count. 1687 * We're done now. 1688 */ 1689 if (ip_local && temp_local && 1690 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { 1691 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); 1692 return xrep_dir_set_nlink(rd); 1693 } 1694 1695 /* 1696 * Clean the transaction before we start working on exchanging 1697 * directory contents. 1698 */ 1699 error = xrep_tempfile_roll_trans(rd->sc); 1700 if (error) 1701 return error; 1702 1703 /* Otherwise, make sure both data forks are in block-mapping mode. */ 1704 error = xrep_dir_swap_prep(sc, temp_local, ip_local); 1705 if (error) 1706 return error; 1707 1708 /* 1709 * Set nlink of the directory in the same transaction sequence that 1710 * (atomically) commits the new directory data. 1711 */ 1712 error = xrep_dir_set_nlink(rd); 1713 if (error) 1714 return error; 1715 1716 return xrep_tempexch_contents(sc, &rd->tx); 1717 } 1718 1719 /* 1720 * Exchange the new directory contents (which we created in the tempfile) with 1721 * the directory being repaired. 1722 */ 1723 STATIC int 1724 xrep_dir_rebuild_tree( 1725 struct xrep_dir *rd) 1726 { 1727 struct xfs_scrub *sc = rd->sc; 1728 int error; 1729 1730 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); 1731 1732 /* 1733 * Take the IOLOCK on the temporary file so that we can run dir 1734 * operations with the same locks held as we would for a normal file. 1735 * We still hold sc->ip's IOLOCK. 1736 */ 1737 error = xrep_tempfile_iolock_polled(rd->sc); 1738 if (error) 1739 return error; 1740 1741 /* 1742 * Allocate transaction, lock inodes, and make sure that we've replayed 1743 * all the stashed dirent updates to the tempdir. After this point, 1744 * we're ready to exchange data fork mappings. 1745 */ 1746 error = xrep_dir_finalize_tempdir(rd); 1747 if (error) 1748 return error; 1749 1750 if (xchk_iscan_aborted(&rd->pscan.iscan)) 1751 return -ECANCELED; 1752 1753 /* 1754 * Exchange the tempdir's data fork with the file being repaired. This 1755 * recreates the transaction and re-takes the ILOCK in the scrub 1756 * context. 1757 */ 1758 error = xrep_dir_swap(rd); 1759 if (error) 1760 return error; 1761 1762 /* 1763 * Release the old directory blocks and reset the data fork of the temp 1764 * directory to an empty shortform directory because inactivation does 1765 * nothing for directories. 1766 */ 1767 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); 1768 if (error) 1769 return error; 1770 1771 /* 1772 * Roll to get a transaction without any inodes joined to it. Then we 1773 * can drop the tempfile's ILOCK and IOLOCK before doing more work on 1774 * the scrub target directory. 1775 */ 1776 error = xfs_trans_roll(&sc->tp); 1777 if (error) 1778 return error; 1779 1780 xrep_tempfile_iunlock(sc); 1781 xrep_tempfile_iounlock(sc); 1782 return 0; 1783 } 1784 1785 /* Set up the filesystem scan so we can regenerate directory entries. */ 1786 STATIC int 1787 xrep_dir_setup_scan( 1788 struct xrep_dir *rd) 1789 { 1790 struct xfs_scrub *sc = rd->sc; 1791 int error; 1792 1793 /* Set up some staging memory for salvaging dirents. */ 1794 error = xfarray_create("directory entries", 0, 1795 sizeof(struct xrep_dirent), &rd->dir_entries); 1796 if (error) 1797 return error; 1798 1799 error = xfblob_create("directory entry names", &rd->dir_names); 1800 if (error) 1801 goto out_xfarray; 1802 1803 if (xfs_has_parent(sc->mp)) 1804 error = __xrep_findparent_scan_start(sc, &rd->pscan, 1805 xrep_dir_live_update); 1806 else 1807 error = xrep_findparent_scan_start(sc, &rd->pscan); 1808 if (error) 1809 goto out_xfblob; 1810 1811 return 0; 1812 1813 out_xfblob: 1814 xfblob_destroy(rd->dir_names); 1815 rd->dir_names = NULL; 1816 out_xfarray: 1817 xfarray_destroy(rd->dir_entries); 1818 rd->dir_entries = NULL; 1819 return error; 1820 } 1821 1822 /* 1823 * Move the current file to the orphanage. 1824 * 1825 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon 1826 * successful return, the scrub transaction will have enough extra reservation 1827 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the 1828 * orphanage; and both inodes will be ijoined. 1829 */ 1830 STATIC int 1831 xrep_dir_move_to_orphanage( 1832 struct xrep_dir *rd) 1833 { 1834 struct xfs_scrub *sc = rd->sc; 1835 xfs_ino_t orig_parent, new_parent; 1836 int error; 1837 1838 /* 1839 * We are about to drop the ILOCK on sc->ip to lock the orphanage and 1840 * prepare for the adoption. Therefore, look up the old dotdot entry 1841 * for sc->ip so that we can compare it after we re-lock sc->ip. 1842 */ 1843 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); 1844 if (error) 1845 return error; 1846 1847 /* 1848 * Drop the ILOCK on the scrub target and commit the transaction. 1849 * Adoption computes its own resource requirements and gathers the 1850 * necessary components. 1851 */ 1852 error = xrep_trans_commit(sc); 1853 if (error) 1854 return error; 1855 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1856 1857 /* If we can take the orphanage's iolock then we're ready to move. */ 1858 if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { 1859 xchk_iunlock(sc, sc->ilock_flags); 1860 error = xrep_orphanage_iolock_two(sc); 1861 if (error) 1862 return error; 1863 } 1864 1865 /* Grab transaction and ILOCK the two files. */ 1866 error = xrep_adoption_trans_alloc(sc, &rd->adoption); 1867 if (error) 1868 return error; 1869 1870 error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); 1871 if (error) 1872 return error; 1873 1874 /* 1875 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot 1876 * entry again. If the parent changed or the child was unlinked while 1877 * the child directory was unlocked, we don't need to move the child to 1878 * the orphanage after all. 1879 */ 1880 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); 1881 if (error) 1882 return error; 1883 1884 /* 1885 * Attach to the orphanage if we still have a linked directory and it 1886 * hasn't been moved. 1887 */ 1888 if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { 1889 error = xrep_adoption_move(&rd->adoption); 1890 if (error) 1891 return error; 1892 } 1893 1894 /* 1895 * Launder the scrub transaction so we can drop the orphanage ILOCK 1896 * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. 1897 */ 1898 error = xrep_adoption_trans_roll(&rd->adoption); 1899 if (error) 1900 return error; 1901 1902 xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); 1903 xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); 1904 return 0; 1905 } 1906 1907 /* 1908 * Repair the directory metadata. 1909 * 1910 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer 1911 * cache in XFS can't handle aliased multiblock buffers, so this might 1912 * misbehave if the directory blocks are crosslinked with other filesystem 1913 * metadata. 1914 * 1915 * XXX: Is it necessary to check the dcache for this directory to make sure 1916 * that we always recreate every cached entry? 1917 */ 1918 int 1919 xrep_directory( 1920 struct xfs_scrub *sc) 1921 { 1922 struct xrep_dir *rd = sc->buf; 1923 int error; 1924 1925 /* The rmapbt is required to reap the old data fork. */ 1926 if (!xfs_has_rmapbt(sc->mp)) 1927 return -EOPNOTSUPP; 1928 /* We require atomic file exchange range to rebuild anything. */ 1929 if (!xfs_has_exchange_range(sc->mp)) 1930 return -EOPNOTSUPP; 1931 1932 error = xrep_dir_setup_scan(rd); 1933 if (error) 1934 return error; 1935 1936 if (xfs_has_parent(sc->mp)) 1937 error = xrep_dir_scan_dirtree(rd); 1938 else 1939 error = xrep_dir_salvage_entries(rd); 1940 if (error) 1941 goto out_teardown; 1942 1943 /* Last chance to abort before we start committing fixes. */ 1944 if (xchk_should_terminate(sc, &error)) 1945 goto out_teardown; 1946 1947 error = xrep_dir_rebuild_tree(rd); 1948 if (error) 1949 goto out_teardown; 1950 1951 if (rd->needs_adoption) { 1952 if (!xrep_orphanage_can_adopt(rd->sc)) 1953 error = -EFSCORRUPTED; 1954 else 1955 error = xrep_dir_move_to_orphanage(rd); 1956 if (error) 1957 goto out_teardown; 1958 } 1959 1960 out_teardown: 1961 xrep_dir_teardown(sc); 1962 return error; 1963 } 1964