1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_bit.h" 14 #include "xfs_log_format.h" 15 #include "xfs_trans.h" 16 #include "xfs_sb.h" 17 #include "xfs_inode.h" 18 #include "xfs_icache.h" 19 #include "xfs_da_format.h" 20 #include "xfs_da_btree.h" 21 #include "xfs_dir2.h" 22 #include "xfs_dir2_priv.h" 23 #include "xfs_bmap.h" 24 #include "xfs_quota.h" 25 #include "xfs_bmap_btree.h" 26 #include "xfs_trans_space.h" 27 #include "xfs_bmap_util.h" 28 #include "xfs_exchmaps.h" 29 #include "xfs_exchrange.h" 30 #include "xfs_ag.h" 31 #include "xfs_parent.h" 32 #include "scrub/xfs_scrub.h" 33 #include "scrub/scrub.h" 34 #include "scrub/common.h" 35 #include "scrub/trace.h" 36 #include "scrub/repair.h" 37 #include "scrub/tempfile.h" 38 #include "scrub/tempexch.h" 39 #include "scrub/xfile.h" 40 #include "scrub/xfarray.h" 41 #include "scrub/xfblob.h" 42 #include "scrub/iscan.h" 43 #include "scrub/readdir.h" 44 #include "scrub/reap.h" 45 #include "scrub/findparent.h" 46 #include "scrub/orphanage.h" 47 #include "scrub/listxattr.h" 48 49 /* 50 * Directory Repair 51 * ================ 52 * 53 * We repair directories by reading the directory data blocks looking for 54 * directory entries that look salvageable (name passes verifiers, entry points 55 * to a valid allocated inode, etc). Each entry worth salvaging is stashed in 56 * memory, and the stashed entries are periodically replayed into a temporary 57 * directory to constrain memory use. Batching the construction of the 58 * temporary directory in this fashion reduces lock cycling of the directory 59 * being repaired and the temporary directory, and will later become important 60 * for parent pointer scanning. 61 * 62 * If parent pointers are enabled on this filesystem, we instead reconstruct 63 * the directory by visiting each parent pointer of each file in the filesystem 64 * and translating the relevant parent pointer records into dirents. In this 65 * case, it is advantageous to stash all directory entries created from parent 66 * pointers for a single child file before replaying them into the temporary 67 * directory. To save memory, the live filesystem scan reuses the findparent 68 * fields. Directory repair chooses either parent pointer scanning or 69 * directory entry salvaging, but not both. 70 * 71 * Directory entries added to the temporary directory do not elevate the link 72 * counts of the inodes found. When salvaging completes, the remaining stashed 73 * entries are replayed to the temporary directory. An atomic mapping exchange 74 * is used to commit the new directory blocks to the directory being repaired. 75 * This will disrupt readdir cursors. 76 * 77 * Locking Issues 78 * -------------- 79 * 80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on 81 * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects 82 * b's dotdot update. This is in contrast to every other dotdot update (link, 83 * remove, mkdir). If the repair code drops the ILOCK, it must either 84 * revalidate the dotdot entry or use dirent hooks to capture updates from 85 * other threads. 86 */ 87 88 /* Create a dirent in the tempdir. */ 89 #define XREP_DIRENT_ADD (1) 90 91 /* Remove a dirent from the tempdir. */ 92 #define XREP_DIRENT_REMOVE (2) 93 94 /* Directory entry to be restored in the new directory. */ 95 struct xrep_dirent { 96 /* Cookie for retrieval of the dirent name. */ 97 xfblob_cookie name_cookie; 98 99 /* Target inode number. */ 100 xfs_ino_t ino; 101 102 /* Length of the dirent name. */ 103 uint8_t namelen; 104 105 /* File type of the dirent. */ 106 uint8_t ftype; 107 108 /* XREP_DIRENT_{ADD,REMOVE} */ 109 uint8_t action; 110 }; 111 112 /* 113 * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names 114 * before we write them to the temp dir. 115 */ 116 #define XREP_DIR_MAX_STASH_BYTES (PAGE_SIZE * 8) 117 118 struct xrep_dir { 119 struct xfs_scrub *sc; 120 121 /* Fixed-size array of xrep_dirent structures. */ 122 struct xfarray *dir_entries; 123 124 /* Blobs containing directory entry names. */ 125 struct xfblob *dir_names; 126 127 /* Information for exchanging data forks at the end. */ 128 struct xrep_tempexch tx; 129 130 /* Preallocated args struct for performing dir operations */ 131 struct xfs_da_args args; 132 133 /* 134 * Information used to scan the filesystem to find the inumber of the 135 * dotdot entry for this directory. For directory salvaging when 136 * parent pointers are not enabled, we use the findparent_* functions 137 * on this object and access only the parent_ino field directly. 138 * 139 * When parent pointers are enabled, however, the pptr scanner uses the 140 * iscan, hooks, lock, and parent_ino fields of this object directly. 141 * @pscan.lock coordinates access to dir_entries, dir_names, 142 * parent_ino, subdirs, dirents, and args. This reduces the memory 143 * requirements of this structure. 144 */ 145 struct xrep_parent_scan_info pscan; 146 147 /* 148 * Context information for attaching this directory to the lost+found 149 * if this directory does not have a parent. 150 */ 151 struct xrep_adoption adoption; 152 153 /* How many subdirectories did we find? */ 154 uint64_t subdirs; 155 156 /* How many dirents did we find? */ 157 unsigned int dirents; 158 159 /* Should we move this directory to the orphanage? */ 160 bool needs_adoption; 161 162 /* Directory entry name, plus the trailing null. */ 163 struct xfs_name xname; 164 unsigned char namebuf[MAXNAMELEN]; 165 }; 166 167 /* Tear down all the incore stuff we created. */ 168 static void 169 xrep_dir_teardown( 170 struct xfs_scrub *sc) 171 { 172 struct xrep_dir *rd = sc->buf; 173 174 xrep_findparent_scan_teardown(&rd->pscan); 175 xfblob_destroy(rd->dir_names); 176 xfarray_destroy(rd->dir_entries); 177 } 178 179 /* Set up for a directory repair. */ 180 int 181 xrep_setup_directory( 182 struct xfs_scrub *sc) 183 { 184 struct xrep_dir *rd; 185 int error; 186 187 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); 188 189 error = xrep_orphanage_try_create(sc); 190 if (error) 191 return error; 192 193 error = xrep_tempfile_create(sc, S_IFDIR); 194 if (error) 195 return error; 196 197 rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS); 198 if (!rd) 199 return -ENOMEM; 200 rd->sc = sc; 201 rd->xname.name = rd->namebuf; 202 sc->buf = rd; 203 204 return 0; 205 } 206 207 /* 208 * Look up the dotdot entry and confirm that it's really the parent. 209 * Returns NULLFSINO if we don't know what to do. 210 */ 211 static inline xfs_ino_t 212 xrep_dir_lookup_parent( 213 struct xrep_dir *rd) 214 { 215 struct xfs_scrub *sc = rd->sc; 216 xfs_ino_t ino; 217 int error; 218 219 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL); 220 if (error) 221 return NULLFSINO; 222 if (!xfs_verify_dir_ino(sc->mp, ino)) 223 return NULLFSINO; 224 225 error = xrep_findparent_confirm(sc, &ino); 226 if (error) 227 return NULLFSINO; 228 229 return ino; 230 } 231 232 /* 233 * Look up '..' in the dentry cache and confirm that it's really the parent. 234 * Returns NULLFSINO if the dcache misses or if the hit is implausible. 235 */ 236 static inline xfs_ino_t 237 xrep_dir_dcache_parent( 238 struct xrep_dir *rd) 239 { 240 struct xfs_scrub *sc = rd->sc; 241 xfs_ino_t parent_ino; 242 int error; 243 244 parent_ino = xrep_findparent_from_dcache(sc); 245 if (parent_ino == NULLFSINO) 246 return parent_ino; 247 248 error = xrep_findparent_confirm(sc, &parent_ino); 249 if (error) 250 return NULLFSINO; 251 252 return parent_ino; 253 } 254 255 /* Try to find the parent of the directory being repaired. */ 256 STATIC int 257 xrep_dir_find_parent( 258 struct xrep_dir *rd) 259 { 260 xfs_ino_t ino; 261 262 ino = xrep_findparent_self_reference(rd->sc); 263 if (ino != NULLFSINO) { 264 xrep_findparent_scan_finish_early(&rd->pscan, ino); 265 return 0; 266 } 267 268 ino = xrep_dir_dcache_parent(rd); 269 if (ino != NULLFSINO) { 270 xrep_findparent_scan_finish_early(&rd->pscan, ino); 271 return 0; 272 } 273 274 ino = xrep_dir_lookup_parent(rd); 275 if (ino != NULLFSINO) { 276 xrep_findparent_scan_finish_early(&rd->pscan, ino); 277 return 0; 278 } 279 280 /* 281 * A full filesystem scan is the last resort. On a busy filesystem, 282 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means 283 * that we don't know what who the parent is, so we should return to 284 * userspace. 285 */ 286 return xrep_findparent_scan(&rd->pscan); 287 } 288 289 /* 290 * Decide if we want to salvage this entry. We don't bother with oversized 291 * names or the dot entry. 292 */ 293 STATIC int 294 xrep_dir_want_salvage( 295 struct xrep_dir *rd, 296 const char *name, 297 int namelen, 298 xfs_ino_t ino) 299 { 300 struct xfs_mount *mp = rd->sc->mp; 301 302 /* No pointers to ourselves or to garbage. */ 303 if (ino == rd->sc->ip->i_ino) 304 return false; 305 if (!xfs_verify_dir_ino(mp, ino)) 306 return false; 307 308 /* No weird looking names or dot entries. */ 309 if (namelen >= MAXNAMELEN || namelen <= 0) 310 return false; 311 if (namelen == 1 && name[0] == '.') 312 return false; 313 if (!xfs_dir2_namecheck(name, namelen)) 314 return false; 315 316 return true; 317 } 318 319 /* 320 * Remember that we want to create a dirent in the tempdir. These stashed 321 * actions will be replayed later. 322 */ 323 STATIC int 324 xrep_dir_stash_createname( 325 struct xrep_dir *rd, 326 const struct xfs_name *name, 327 xfs_ino_t ino) 328 { 329 struct xrep_dirent dirent = { 330 .action = XREP_DIRENT_ADD, 331 .ino = ino, 332 .namelen = name->len, 333 .ftype = name->type, 334 }; 335 int error; 336 337 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino); 338 339 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 340 if (error) 341 return error; 342 343 return xfarray_append(rd->dir_entries, &dirent); 344 } 345 346 /* 347 * Remember that we want to remove a dirent from the tempdir. These stashed 348 * actions will be replayed later. 349 */ 350 STATIC int 351 xrep_dir_stash_removename( 352 struct xrep_dir *rd, 353 const struct xfs_name *name, 354 xfs_ino_t ino) 355 { 356 struct xrep_dirent dirent = { 357 .action = XREP_DIRENT_REMOVE, 358 .ino = ino, 359 .namelen = name->len, 360 .ftype = name->type, 361 }; 362 int error; 363 364 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino); 365 366 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name); 367 if (error) 368 return error; 369 370 return xfarray_append(rd->dir_entries, &dirent); 371 } 372 373 /* Allocate an in-core record to hold entries while we rebuild the dir data. */ 374 STATIC int 375 xrep_dir_salvage_entry( 376 struct xrep_dir *rd, 377 unsigned char *name, 378 unsigned int namelen, 379 xfs_ino_t ino) 380 { 381 struct xfs_name xname = { 382 .name = name, 383 }; 384 struct xfs_scrub *sc = rd->sc; 385 struct xfs_inode *ip; 386 unsigned int i = 0; 387 int error = 0; 388 389 if (xchk_should_terminate(sc, &error)) 390 return error; 391 392 /* 393 * Truncate the name to the first character that would trip namecheck. 394 * If we no longer have a name after that, ignore this entry. 395 */ 396 while (i < namelen && name[i] != 0 && name[i] != '/') 397 i++; 398 if (i == 0) 399 return 0; 400 xname.len = i; 401 402 /* Ignore '..' entries; we already picked the new parent. */ 403 if (xname.len == 2 && name[0] == '.' && name[1] == '.') { 404 trace_xrep_dir_salvaged_parent(sc->ip, ino); 405 return 0; 406 } 407 408 trace_xrep_dir_salvage_entry(sc->ip, &xname, ino); 409 410 /* 411 * Compute the ftype or dump the entry if we can't. We don't lock the 412 * inode because inodes can't change type while we have a reference. 413 */ 414 error = xchk_iget(sc, ino, &ip); 415 if (error) 416 return 0; 417 418 /* Don't mix metadata and regular directory trees. */ 419 if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) { 420 xchk_irele(sc, ip); 421 return 0; 422 } 423 424 xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); 425 xchk_irele(sc, ip); 426 427 return xrep_dir_stash_createname(rd, &xname, ino); 428 } 429 430 /* Record a shortform directory entry for later reinsertion. */ 431 STATIC int 432 xrep_dir_salvage_sf_entry( 433 struct xrep_dir *rd, 434 struct xfs_dir2_sf_hdr *sfp, 435 struct xfs_dir2_sf_entry *sfep) 436 { 437 xfs_ino_t ino; 438 439 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); 440 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) 441 return 0; 442 443 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); 444 } 445 446 /* Record a regular directory entry for later reinsertion. */ 447 STATIC int 448 xrep_dir_salvage_data_entry( 449 struct xrep_dir *rd, 450 struct xfs_dir2_data_entry *dep) 451 { 452 xfs_ino_t ino; 453 454 ino = be64_to_cpu(dep->inumber); 455 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) 456 return 0; 457 458 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); 459 } 460 461 /* Try to recover block/data format directory entries. */ 462 STATIC int 463 xrep_dir_recover_data( 464 struct xrep_dir *rd, 465 struct xfs_buf *bp) 466 { 467 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; 468 unsigned int offset; 469 unsigned int end; 470 int error = 0; 471 472 /* 473 * Loop over the data portion of the block. 474 * Each object is a real entry (dep) or an unused one (dup). 475 */ 476 offset = geo->data_entry_offset; 477 end = min_t(unsigned int, BBTOB(bp->b_length), 478 xfs_dir3_data_end_offset(geo, bp->b_addr)); 479 480 while (offset < end) { 481 struct xfs_dir2_data_unused *dup = bp->b_addr + offset; 482 struct xfs_dir2_data_entry *dep = bp->b_addr + offset; 483 484 if (xchk_should_terminate(rd->sc, &error)) 485 return error; 486 487 /* Skip unused entries. */ 488 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 489 offset += be16_to_cpu(dup->length); 490 continue; 491 } 492 493 /* Don't walk off the end of the block. */ 494 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); 495 if (offset > end) 496 break; 497 498 /* Ok, let's save this entry. */ 499 error = xrep_dir_salvage_data_entry(rd, dep); 500 if (error) 501 return error; 502 503 } 504 505 return 0; 506 } 507 508 /* Try to recover shortform directory entries. */ 509 STATIC int 510 xrep_dir_recover_sf( 511 struct xrep_dir *rd) 512 { 513 struct xfs_dir2_sf_hdr *hdr; 514 struct xfs_dir2_sf_entry *sfep; 515 struct xfs_dir2_sf_entry *next; 516 struct xfs_ifork *ifp; 517 xfs_ino_t ino; 518 unsigned char *end; 519 int error = 0; 520 521 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK); 522 hdr = ifp->if_data; 523 end = (unsigned char *)ifp->if_data + ifp->if_bytes; 524 525 ino = xfs_dir2_sf_get_parent_ino(hdr); 526 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino); 527 528 sfep = xfs_dir2_sf_firstentry(hdr); 529 while ((unsigned char *)sfep < end) { 530 if (xchk_should_terminate(rd->sc, &error)) 531 return error; 532 533 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep); 534 if ((unsigned char *)next > end) 535 break; 536 537 /* Ok, let's save this entry. */ 538 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep); 539 if (error) 540 return error; 541 542 sfep = next; 543 } 544 545 return 0; 546 } 547 548 /* 549 * Try to figure out the format of this directory from the data fork mappings 550 * and the directory size. If we can be reasonably sure of format, we can be 551 * more aggressive in salvaging directory entries. On return, @magic_guess 552 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" 553 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, 554 * and 0 if we can't tell. 555 */ 556 STATIC void 557 xrep_dir_guess_format( 558 struct xrep_dir *rd, 559 __be32 *magic_guess) 560 { 561 struct xfs_inode *dp = rd->sc->ip; 562 struct xfs_mount *mp = rd->sc->mp; 563 struct xfs_da_geometry *geo = mp->m_dir_geo; 564 xfs_fileoff_t last; 565 int error; 566 567 ASSERT(xfs_has_crc(mp)); 568 569 *magic_guess = 0; 570 571 /* 572 * If there's a single directory block and the directory size is 573 * exactly one block, this has to be a single block format directory. 574 */ 575 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK); 576 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize && 577 dp->i_disk_size == geo->blksize) { 578 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); 579 return; 580 } 581 582 /* 583 * If the last extent before the leaf offset matches the directory 584 * size and the directory size is larger than 1 block, this is a 585 * data format directory. 586 */ 587 last = geo->leafblk; 588 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK); 589 if (!error && 590 XFS_FSB_TO_B(mp, last) > geo->blksize && 591 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) { 592 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); 593 return; 594 } 595 } 596 597 /* Recover directory entries from a specific directory block. */ 598 STATIC int 599 xrep_dir_recover_dirblock( 600 struct xrep_dir *rd, 601 __be32 magic_guess, 602 xfs_dablk_t dabno) 603 { 604 struct xfs_dir2_data_hdr *hdr; 605 struct xfs_buf *bp; 606 __be32 oldmagic; 607 int error; 608 609 /* 610 * Try to read buffer. We invalidate them in the next step so we don't 611 * bother to set a buffer type or ops. 612 */ 613 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, 614 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL); 615 if (error || !bp) 616 return error; 617 618 hdr = bp->b_addr; 619 oldmagic = hdr->magic; 620 621 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, 622 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); 623 624 /* 625 * If we're sure of the block's format, proceed with the salvage 626 * operation using the specified magic number. 627 */ 628 if (magic_guess) { 629 hdr->magic = magic_guess; 630 goto recover; 631 } 632 633 /* 634 * If we couldn't guess what type of directory this is, then we will 635 * only salvage entries from directory blocks that match the magic 636 * number and pass verifiers. 637 */ 638 switch (hdr->magic) { 639 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): 640 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): 641 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) 642 goto out; 643 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL) 644 goto out; 645 break; 646 case cpu_to_be32(XFS_DIR2_DATA_MAGIC): 647 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 648 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) 649 goto out; 650 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL) 651 goto out; 652 break; 653 default: 654 goto out; 655 } 656 657 recover: 658 error = xrep_dir_recover_data(rd, bp); 659 660 out: 661 hdr->magic = oldmagic; 662 xfs_trans_brelse(rd->sc->tp, bp); 663 return error; 664 } 665 666 static inline void 667 xrep_dir_init_args( 668 struct xrep_dir *rd, 669 struct xfs_inode *dp, 670 const struct xfs_name *name) 671 { 672 memset(&rd->args, 0, sizeof(struct xfs_da_args)); 673 rd->args.geo = rd->sc->mp->m_dir_geo; 674 rd->args.whichfork = XFS_DATA_FORK; 675 rd->args.owner = rd->sc->ip->i_ino; 676 rd->args.trans = rd->sc->tp; 677 rd->args.dp = dp; 678 if (!name) 679 return; 680 rd->args.name = name->name; 681 rd->args.namelen = name->len; 682 rd->args.filetype = name->type; 683 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name); 684 } 685 686 /* Replay a stashed createname into the temporary directory. */ 687 STATIC int 688 xrep_dir_replay_createname( 689 struct xrep_dir *rd, 690 const struct xfs_name *name, 691 xfs_ino_t inum, 692 xfs_extlen_t total) 693 { 694 struct xfs_scrub *sc = rd->sc; 695 struct xfs_inode *dp = rd->sc->tempip; 696 int error; 697 698 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 699 700 error = xfs_dir_ino_validate(sc->mp, inum); 701 if (error) 702 return error; 703 704 trace_xrep_dir_replay_createname(dp, name, inum); 705 706 xrep_dir_init_args(rd, dp, name); 707 rd->args.inumber = inum; 708 rd->args.total = total; 709 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 710 return xfs_dir_createname_args(&rd->args); 711 } 712 713 /* Replay a stashed removename onto the temporary directory. */ 714 STATIC int 715 xrep_dir_replay_removename( 716 struct xrep_dir *rd, 717 const struct xfs_name *name, 718 xfs_extlen_t total) 719 { 720 struct xfs_inode *dp = rd->args.dp; 721 722 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 723 724 xrep_dir_init_args(rd, dp, name); 725 rd->args.op_flags = 0; 726 rd->args.total = total; 727 728 trace_xrep_dir_replay_removename(dp, name, 0); 729 return xfs_dir_removename_args(&rd->args); 730 } 731 732 /* 733 * Add this stashed incore directory entry to the temporary directory. 734 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and 735 * must not be in transaction context. 736 */ 737 STATIC int 738 xrep_dir_replay_update( 739 struct xrep_dir *rd, 740 const struct xfs_name *xname, 741 const struct xrep_dirent *dirent) 742 { 743 struct xfs_mount *mp = rd->sc->mp; 744 #ifdef DEBUG 745 xfs_ino_t ino; 746 #endif 747 uint resblks; 748 int error; 749 750 resblks = xfs_link_space_res(mp, xname->len); 751 error = xchk_trans_alloc(rd->sc, resblks); 752 if (error) 753 return error; 754 755 /* Lock the temporary directory and join it to the transaction */ 756 xrep_tempfile_ilock(rd->sc); 757 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0); 758 759 switch (dirent->action) { 760 case XREP_DIRENT_ADD: 761 /* 762 * Create a replacement dirent in the temporary directory. 763 * Note that _createname doesn't check for existing entries. 764 * There shouldn't be any in the temporary dir, but we'll 765 * verify this in debug mode. 766 */ 767 #ifdef DEBUG 768 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 769 if (error != -ENOENT) { 770 ASSERT(error != -ENOENT); 771 goto out_cancel; 772 } 773 #endif 774 775 error = xrep_dir_replay_createname(rd, xname, dirent->ino, 776 resblks); 777 if (error) 778 goto out_cancel; 779 780 if (xname->type == XFS_DIR3_FT_DIR) 781 rd->subdirs++; 782 rd->dirents++; 783 break; 784 case XREP_DIRENT_REMOVE: 785 /* 786 * Remove a dirent from the temporary directory. Note that 787 * _removename doesn't check the inode target of the exist 788 * entry. There should be a perfect match in the temporary 789 * dir, but we'll verify this in debug mode. 790 */ 791 #ifdef DEBUG 792 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino); 793 if (error) { 794 ASSERT(error != 0); 795 goto out_cancel; 796 } 797 if (ino != dirent->ino) { 798 ASSERT(ino == dirent->ino); 799 error = -EIO; 800 goto out_cancel; 801 } 802 #endif 803 804 error = xrep_dir_replay_removename(rd, xname, resblks); 805 if (error) 806 goto out_cancel; 807 808 if (xname->type == XFS_DIR3_FT_DIR) 809 rd->subdirs--; 810 rd->dirents--; 811 break; 812 default: 813 ASSERT(0); 814 error = -EIO; 815 goto out_cancel; 816 } 817 818 /* Commit and unlock. */ 819 error = xrep_trans_commit(rd->sc); 820 if (error) 821 return error; 822 823 xrep_tempfile_iunlock(rd->sc); 824 return 0; 825 out_cancel: 826 xchk_trans_cancel(rd->sc); 827 xrep_tempfile_iunlock(rd->sc); 828 return error; 829 } 830 831 /* 832 * Flush stashed incore dirent updates that have been recorded by the scanner. 833 * This is done to reduce the memory requirements of the directory rebuild, 834 * since directories can contain up to 32GB of directory data. 835 * 836 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir 837 * IOLOCK. 838 */ 839 STATIC int 840 xrep_dir_replay_updates( 841 struct xrep_dir *rd) 842 { 843 xfarray_idx_t array_cur; 844 int error; 845 846 /* Add all the salvaged dirents to the temporary directory. */ 847 mutex_lock(&rd->pscan.lock); 848 foreach_xfarray_idx(rd->dir_entries, array_cur) { 849 struct xrep_dirent dirent; 850 851 error = xfarray_load(rd->dir_entries, array_cur, &dirent); 852 if (error) 853 goto out_unlock; 854 855 error = xfblob_loadname(rd->dir_names, dirent.name_cookie, 856 &rd->xname, dirent.namelen); 857 if (error) 858 goto out_unlock; 859 rd->xname.type = dirent.ftype; 860 mutex_unlock(&rd->pscan.lock); 861 862 error = xrep_dir_replay_update(rd, &rd->xname, &dirent); 863 if (error) 864 return error; 865 mutex_lock(&rd->pscan.lock); 866 } 867 868 /* Empty out both arrays now that we've added the entries. */ 869 xfarray_truncate(rd->dir_entries); 870 xfblob_truncate(rd->dir_names); 871 mutex_unlock(&rd->pscan.lock); 872 return 0; 873 out_unlock: 874 mutex_unlock(&rd->pscan.lock); 875 return error; 876 } 877 878 /* 879 * Periodically flush stashed directory entries to the temporary dir. This 880 * is done to reduce the memory requirements of the directory rebuild, since 881 * directories can contain up to 32GB of directory data. 882 */ 883 STATIC int 884 xrep_dir_flush_stashed( 885 struct xrep_dir *rd) 886 { 887 int error; 888 889 /* 890 * Entering this function, the scrub context has a reference to the 891 * inode being repaired, the temporary file, and a scrub transaction 892 * that we use during dirent salvaging to avoid livelocking if there 893 * are cycles in the directory structures. We hold ILOCK_EXCL on both 894 * the inode being repaired and the temporary file, though they are 895 * not ijoined to the scrub transaction. 896 * 897 * To constrain kernel memory use, we occasionally write salvaged 898 * dirents from the xfarray and xfblob structures into the temporary 899 * directory in preparation for exchanging the directory structures at 900 * the end. Updating the temporary file requires a transaction, so we 901 * commit the scrub transaction and drop the two ILOCKs so that 902 * we can allocate whatever transaction we want. 903 * 904 * We still hold IOLOCK_EXCL on the inode being repaired, which 905 * prevents anyone from accessing the damaged directory data while we 906 * repair it. 907 */ 908 error = xrep_trans_commit(rd->sc); 909 if (error) 910 return error; 911 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL); 912 913 /* 914 * Take the IOLOCK of the temporary file while we modify dirents. This 915 * isn't strictly required because the temporary file is never revealed 916 * to userspace, but we follow the same locking rules. We still hold 917 * sc->ip's IOLOCK. 918 */ 919 error = xrep_tempfile_iolock_polled(rd->sc); 920 if (error) 921 return error; 922 923 /* Write to the tempdir all the updates that we've stashed. */ 924 error = xrep_dir_replay_updates(rd); 925 xrep_tempfile_iounlock(rd->sc); 926 if (error) 927 return error; 928 929 /* 930 * Recreate the salvage transaction and relock the dir we're salvaging. 931 */ 932 error = xchk_trans_alloc(rd->sc, 0); 933 if (error) 934 return error; 935 xchk_ilock(rd->sc, XFS_ILOCK_EXCL); 936 return 0; 937 } 938 939 /* Decide if we've stashed too much dirent data in memory. */ 940 static inline bool 941 xrep_dir_want_flush_stashed( 942 struct xrep_dir *rd) 943 { 944 unsigned long long bytes; 945 946 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names); 947 return bytes > XREP_DIR_MAX_STASH_BYTES; 948 } 949 950 /* Extract as many directory entries as we can. */ 951 STATIC int 952 xrep_dir_recover( 953 struct xrep_dir *rd) 954 { 955 struct xfs_bmbt_irec got; 956 struct xfs_scrub *sc = rd->sc; 957 struct xfs_da_geometry *geo = sc->mp->m_dir_geo; 958 xfs_fileoff_t offset; 959 xfs_dablk_t dabno; 960 __be32 magic_guess; 961 int nmap; 962 int error; 963 964 xrep_dir_guess_format(rd, &magic_guess); 965 966 /* Iterate each directory data block in the data fork. */ 967 for (offset = 0; 968 offset < geo->leafblk; 969 offset = got.br_startoff + got.br_blockcount) { 970 nmap = 1; 971 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset, 972 &got, &nmap, 0); 973 if (error) 974 return error; 975 if (nmap != 1) 976 return -EFSCORRUPTED; 977 if (!xfs_bmap_is_written_extent(&got)) 978 continue; 979 980 for (dabno = round_up(got.br_startoff, geo->fsbcount); 981 dabno < got.br_startoff + got.br_blockcount; 982 dabno += geo->fsbcount) { 983 if (xchk_should_terminate(rd->sc, &error)) 984 return error; 985 986 error = xrep_dir_recover_dirblock(rd, 987 magic_guess, dabno); 988 if (error) 989 return error; 990 991 /* Flush dirents to constrain memory usage. */ 992 if (xrep_dir_want_flush_stashed(rd)) { 993 error = xrep_dir_flush_stashed(rd); 994 if (error) 995 return error; 996 } 997 } 998 } 999 1000 return 0; 1001 } 1002 1003 /* 1004 * Find all the directory entries for this inode by scraping them out of the 1005 * directory leaf blocks by hand, and flushing them into the temp dir. 1006 */ 1007 STATIC int 1008 xrep_dir_find_entries( 1009 struct xrep_dir *rd) 1010 { 1011 struct xfs_inode *dp = rd->sc->ip; 1012 int error; 1013 1014 /* 1015 * Salvage directory entries from the old directory, and write them to 1016 * the temporary directory. 1017 */ 1018 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 1019 error = xrep_dir_recover_sf(rd); 1020 } else { 1021 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK); 1022 if (error) 1023 return error; 1024 1025 error = xrep_dir_recover(rd); 1026 } 1027 if (error) 1028 return error; 1029 1030 return xrep_dir_flush_stashed(rd); 1031 } 1032 1033 /* Scan all files in the filesystem for dirents. */ 1034 STATIC int 1035 xrep_dir_salvage_entries( 1036 struct xrep_dir *rd) 1037 { 1038 struct xfs_scrub *sc = rd->sc; 1039 int error; 1040 1041 /* 1042 * Drop the ILOCK on this directory so that we can scan for this 1043 * directory's parent. Figure out who is going to be the parent of 1044 * this directory, then retake the ILOCK so that we can salvage 1045 * directory entries. 1046 */ 1047 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1048 error = xrep_dir_find_parent(rd); 1049 xchk_ilock(sc, XFS_ILOCK_EXCL); 1050 if (error) 1051 return error; 1052 1053 /* 1054 * Collect directory entries by parsing raw leaf blocks to salvage 1055 * whatever we can. When we're done, free the staging memory before 1056 * exchanging the directories to reduce memory usage. 1057 */ 1058 error = xrep_dir_find_entries(rd); 1059 if (error) 1060 return error; 1061 1062 /* 1063 * Cancel the repair transaction and drop the ILOCK so that we can 1064 * (later) use the atomic mapping exchange functions to compute the 1065 * correct block reservations and re-lock the inodes. 1066 * 1067 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory 1068 * modifications, but there's nothing to prevent userspace from reading 1069 * the directory until we're ready for the exchange operation. Reads 1070 * will return -EIO without shutting down the fs, so we're ok with 1071 * that. 1072 * 1073 * The VFS can change dotdot on us, but the findparent scan will keep 1074 * our incore parent inode up to date. See the note on locking issues 1075 * for more details. 1076 */ 1077 error = xrep_trans_commit(sc); 1078 if (error) 1079 return error; 1080 1081 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1082 return 0; 1083 } 1084 1085 1086 /* 1087 * Examine a parent pointer of a file. If it leads us back to the directory 1088 * that we're rebuilding, create an incore dirent from the parent pointer and 1089 * stash it. 1090 */ 1091 STATIC int 1092 xrep_dir_scan_pptr( 1093 struct xfs_scrub *sc, 1094 struct xfs_inode *ip, 1095 unsigned int attr_flags, 1096 const unsigned char *name, 1097 unsigned int namelen, 1098 const void *value, 1099 unsigned int valuelen, 1100 void *priv) 1101 { 1102 struct xfs_name xname = { 1103 .name = name, 1104 .len = namelen, 1105 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode), 1106 }; 1107 xfs_ino_t parent_ino; 1108 uint32_t parent_gen; 1109 struct xrep_dir *rd = priv; 1110 int error; 1111 1112 if (!(attr_flags & XFS_ATTR_PARENT)) 1113 return 0; 1114 1115 /* 1116 * Ignore parent pointers that point back to a different dir, list the 1117 * wrong generation number, or are invalid. 1118 */ 1119 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value, 1120 valuelen, &parent_ino, &parent_gen); 1121 if (error) 1122 return error; 1123 1124 if (parent_ino != sc->ip->i_ino || 1125 parent_gen != VFS_I(sc->ip)->i_generation) 1126 return 0; 1127 1128 mutex_lock(&rd->pscan.lock); 1129 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino); 1130 mutex_unlock(&rd->pscan.lock); 1131 return error; 1132 } 1133 1134 /* 1135 * If this child dirent points to the directory being repaired, remember that 1136 * fact so that we can reset the dotdot entry if necessary. 1137 */ 1138 STATIC int 1139 xrep_dir_scan_dirent( 1140 struct xfs_scrub *sc, 1141 struct xfs_inode *dp, 1142 xfs_dir2_dataptr_t dapos, 1143 const struct xfs_name *name, 1144 xfs_ino_t ino, 1145 void *priv) 1146 { 1147 struct xrep_dir *rd = priv; 1148 1149 /* Dirent doesn't point to this directory. */ 1150 if (ino != rd->sc->ip->i_ino) 1151 return 0; 1152 1153 /* Ignore garbage inum. */ 1154 if (!xfs_verify_dir_ino(rd->sc->mp, ino)) 1155 return 0; 1156 1157 /* No weird looking names. */ 1158 if (name->len >= MAXNAMELEN || name->len <= 0) 1159 return 0; 1160 1161 /* Don't pick up dot or dotdot entries; we only want child dirents. */ 1162 if (xfs_dir2_samename(name, &xfs_name_dotdot) || 1163 xfs_dir2_samename(name, &xfs_name_dot)) 1164 return 0; 1165 1166 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot, 1167 dp->i_ino); 1168 1169 xrep_findparent_scan_found(&rd->pscan, dp->i_ino); 1170 return 0; 1171 } 1172 1173 /* 1174 * Decide if we want to look for child dirents or parent pointers in this file. 1175 * Skip the dir being repaired and any files being used to stage repairs. 1176 */ 1177 static inline bool 1178 xrep_dir_want_scan( 1179 struct xrep_dir *rd, 1180 const struct xfs_inode *ip) 1181 { 1182 return ip != rd->sc->ip && !xrep_is_tempfile(ip); 1183 } 1184 1185 /* 1186 * Take ILOCK on a file that we want to scan. 1187 * 1188 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or 1189 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED. 1190 */ 1191 static inline unsigned int 1192 xrep_dir_scan_ilock( 1193 struct xrep_dir *rd, 1194 struct xfs_inode *ip) 1195 { 1196 uint lock_mode = XFS_ILOCK_SHARED; 1197 1198 /* Need to take the shared ILOCK to advance the iscan cursor. */ 1199 if (!xrep_dir_want_scan(rd, ip)) 1200 goto lock; 1201 1202 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) { 1203 lock_mode = XFS_ILOCK_EXCL; 1204 goto lock; 1205 } 1206 1207 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) 1208 lock_mode = XFS_ILOCK_EXCL; 1209 1210 lock: 1211 xfs_ilock(ip, lock_mode); 1212 return lock_mode; 1213 } 1214 1215 /* 1216 * Scan this file for relevant child dirents or parent pointers that point to 1217 * the directory we're rebuilding. 1218 */ 1219 STATIC int 1220 xrep_dir_scan_file( 1221 struct xrep_dir *rd, 1222 struct xfs_inode *ip) 1223 { 1224 unsigned int lock_mode; 1225 int error = 0; 1226 1227 lock_mode = xrep_dir_scan_ilock(rd, ip); 1228 1229 if (!xrep_dir_want_scan(rd, ip)) 1230 goto scan_done; 1231 1232 /* 1233 * If the extended attributes look as though they has been zapped by 1234 * the inode record repair code, we cannot scan for parent pointers. 1235 */ 1236 if (xchk_pptr_looks_zapped(ip)) { 1237 error = -EBUSY; 1238 goto scan_done; 1239 } 1240 1241 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd); 1242 if (error) 1243 goto scan_done; 1244 1245 if (S_ISDIR(VFS_I(ip)->i_mode)) { 1246 /* 1247 * If the directory looks as though it has been zapped by the 1248 * inode record repair code, we cannot scan for child dirents. 1249 */ 1250 if (xchk_dir_looks_zapped(ip)) { 1251 error = -EBUSY; 1252 goto scan_done; 1253 } 1254 1255 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd); 1256 if (error) 1257 goto scan_done; 1258 } 1259 1260 scan_done: 1261 xchk_iscan_mark_visited(&rd->pscan.iscan, ip); 1262 xfs_iunlock(ip, lock_mode); 1263 return error; 1264 } 1265 1266 /* 1267 * Scan all files in the filesystem for parent pointers that we can turn into 1268 * replacement dirents, and a dirent that we can use to set the dotdot pointer. 1269 */ 1270 STATIC int 1271 xrep_dir_scan_dirtree( 1272 struct xrep_dir *rd) 1273 { 1274 struct xfs_scrub *sc = rd->sc; 1275 struct xfs_inode *ip; 1276 int error; 1277 1278 /* Roots of directory trees are their own parents. */ 1279 if (xchk_inode_is_dirtree_root(sc->ip)) 1280 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); 1281 1282 /* 1283 * Filesystem scans are time consuming. Drop the directory ILOCK and 1284 * all other resources for the duration of the scan and hope for the 1285 * best. The live update hooks will keep our scan information up to 1286 * date even though we've dropped the locks. 1287 */ 1288 xchk_trans_cancel(sc); 1289 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) 1290 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | 1291 XFS_ILOCK_EXCL)); 1292 error = xchk_trans_alloc_empty(sc); 1293 if (error) 1294 return error; 1295 1296 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { 1297 bool flush; 1298 1299 error = xrep_dir_scan_file(rd, ip); 1300 xchk_irele(sc, ip); 1301 if (error) 1302 break; 1303 1304 /* Flush stashed dirent updates to constrain memory usage. */ 1305 mutex_lock(&rd->pscan.lock); 1306 flush = xrep_dir_want_flush_stashed(rd); 1307 mutex_unlock(&rd->pscan.lock); 1308 if (flush) { 1309 xchk_trans_cancel(sc); 1310 1311 error = xrep_tempfile_iolock_polled(sc); 1312 if (error) 1313 break; 1314 1315 error = xrep_dir_replay_updates(rd); 1316 xrep_tempfile_iounlock(sc); 1317 if (error) 1318 break; 1319 1320 error = xchk_trans_alloc_empty(sc); 1321 if (error) 1322 break; 1323 } 1324 1325 if (xchk_should_terminate(sc, &error)) 1326 break; 1327 } 1328 xchk_iscan_iter_finish(&rd->pscan.iscan); 1329 if (error) { 1330 /* 1331 * If we couldn't grab an inode that was busy with a state 1332 * change, change the error code so that we exit to userspace 1333 * as quickly as possible. 1334 */ 1335 if (error == -EBUSY) 1336 return -ECANCELED; 1337 return error; 1338 } 1339 1340 /* 1341 * Cancel the empty transaction so that we can (later) use the atomic 1342 * file mapping exchange functions to lock files and commit the new 1343 * directory. 1344 */ 1345 xchk_trans_cancel(rd->sc); 1346 return 0; 1347 } 1348 1349 /* 1350 * Capture dirent updates being made by other threads which are relevant to the 1351 * directory being repaired. 1352 */ 1353 STATIC int 1354 xrep_dir_live_update( 1355 struct notifier_block *nb, 1356 unsigned long action, 1357 void *data) 1358 { 1359 struct xfs_dir_update_params *p = data; 1360 struct xrep_dir *rd; 1361 struct xfs_scrub *sc; 1362 int error = 0; 1363 1364 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb); 1365 sc = rd->sc; 1366 1367 /* 1368 * This thread updated a child dirent in the directory that we're 1369 * rebuilding. Stash the update for replay against the temporary 1370 * directory. 1371 */ 1372 if (p->dp->i_ino == sc->ip->i_ino && 1373 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) { 1374 mutex_lock(&rd->pscan.lock); 1375 if (p->delta > 0) 1376 error = xrep_dir_stash_createname(rd, p->name, 1377 p->ip->i_ino); 1378 else 1379 error = xrep_dir_stash_removename(rd, p->name, 1380 p->ip->i_ino); 1381 mutex_unlock(&rd->pscan.lock); 1382 if (error) 1383 goto out_abort; 1384 } 1385 1386 /* 1387 * This thread updated another directory's child dirent that points to 1388 * the directory that we're rebuilding, so remember the new dotdot 1389 * target. 1390 */ 1391 if (p->ip->i_ino == sc->ip->i_ino && 1392 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) { 1393 if (p->delta > 0) { 1394 trace_xrep_dir_stash_createname(sc->tempip, 1395 &xfs_name_dotdot, 1396 p->dp->i_ino); 1397 1398 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino); 1399 } else { 1400 trace_xrep_dir_stash_removename(sc->tempip, 1401 &xfs_name_dotdot, 1402 rd->pscan.parent_ino); 1403 1404 xrep_findparent_scan_found(&rd->pscan, NULLFSINO); 1405 } 1406 } 1407 1408 return NOTIFY_DONE; 1409 out_abort: 1410 xchk_iscan_abort(&rd->pscan.iscan); 1411 return NOTIFY_DONE; 1412 } 1413 1414 /* 1415 * Free all the directory blocks and reset the data fork. The caller must 1416 * join the inode to the transaction. This function returns with the inode 1417 * joined to a clean scrub transaction. 1418 */ 1419 STATIC int 1420 xrep_dir_reset_fork( 1421 struct xrep_dir *rd, 1422 xfs_ino_t parent_ino) 1423 { 1424 struct xfs_scrub *sc = rd->sc; 1425 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK); 1426 int error; 1427 1428 /* Unmap all the directory buffers. */ 1429 if (xfs_ifork_has_extents(ifp)) { 1430 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); 1431 if (error) 1432 return error; 1433 } 1434 1435 trace_xrep_dir_reset_fork(sc->tempip, parent_ino); 1436 1437 /* Reset the data fork to an empty data fork. */ 1438 xfs_idestroy_fork(ifp); 1439 ifp->if_bytes = 0; 1440 sc->tempip->i_disk_size = 0; 1441 1442 /* Reinitialize the short form directory. */ 1443 xrep_dir_init_args(rd, sc->tempip, NULL); 1444 return xfs_dir2_sf_create(&rd->args, parent_ino); 1445 } 1446 1447 /* 1448 * Prepare both inodes' directory forks for exchanging mappings. Promote the 1449 * tempfile from short format to leaf format, and if the file being repaired 1450 * has a short format data fork, turn it into an empty extent list. 1451 */ 1452 STATIC int 1453 xrep_dir_swap_prep( 1454 struct xfs_scrub *sc, 1455 bool temp_local, 1456 bool ip_local) 1457 { 1458 int error; 1459 1460 /* 1461 * If the tempfile's directory is in shortform format, convert that to 1462 * a single leaf extent so that we can use the atomic mapping exchange. 1463 */ 1464 if (temp_local) { 1465 struct xfs_da_args args = { 1466 .dp = sc->tempip, 1467 .geo = sc->mp->m_dir_geo, 1468 .whichfork = XFS_DATA_FORK, 1469 .trans = sc->tp, 1470 .total = 1, 1471 .owner = sc->ip->i_ino, 1472 }; 1473 1474 error = xfs_dir2_sf_to_block(&args); 1475 if (error) 1476 return error; 1477 1478 /* 1479 * Roll the deferred log items to get us back to a clean 1480 * transaction. 1481 */ 1482 error = xfs_defer_finish(&sc->tp); 1483 if (error) 1484 return error; 1485 } 1486 1487 /* 1488 * If the file being repaired had a shortform data fork, convert that 1489 * to an empty extent list in preparation for the atomic mapping 1490 * exchange. 1491 */ 1492 if (ip_local) { 1493 struct xfs_ifork *ifp; 1494 1495 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); 1496 xfs_idestroy_fork(ifp); 1497 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 1498 ifp->if_nextents = 0; 1499 ifp->if_bytes = 0; 1500 ifp->if_data = NULL; 1501 ifp->if_height = 0; 1502 1503 xfs_trans_log_inode(sc->tp, sc->ip, 1504 XFS_ILOG_CORE | XFS_ILOG_DDATA); 1505 } 1506 1507 return 0; 1508 } 1509 1510 /* 1511 * Replace the inode number of a directory entry. 1512 */ 1513 static int 1514 xrep_dir_replace( 1515 struct xrep_dir *rd, 1516 struct xfs_inode *dp, 1517 const struct xfs_name *name, 1518 xfs_ino_t inum, 1519 xfs_extlen_t total) 1520 { 1521 struct xfs_scrub *sc = rd->sc; 1522 int error; 1523 1524 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 1525 1526 error = xfs_dir_ino_validate(sc->mp, inum); 1527 if (error) 1528 return error; 1529 1530 xrep_dir_init_args(rd, dp, name); 1531 rd->args.inumber = inum; 1532 rd->args.total = total; 1533 return xfs_dir_replace_args(&rd->args); 1534 } 1535 1536 /* 1537 * Reset the link count of this directory and adjust the unlinked list pointers 1538 * as needed. 1539 */ 1540 STATIC int 1541 xrep_dir_set_nlink( 1542 struct xrep_dir *rd) 1543 { 1544 struct xfs_scrub *sc = rd->sc; 1545 struct xfs_inode *dp = sc->ip; 1546 struct xfs_perag *pag; 1547 unsigned int new_nlink = min_t(unsigned long long, 1548 rd->subdirs + 2, 1549 XFS_NLINK_PINNED); 1550 int error; 1551 1552 /* 1553 * The directory is not on the incore unlinked list, which means that 1554 * it needs to be reachable via the directory tree. Update the nlink 1555 * with our observed link count. If the directory has no parent, it 1556 * will be moved to the orphanage. 1557 */ 1558 if (!xfs_inode_on_unlinked_list(dp)) 1559 goto reset_nlink; 1560 1561 /* 1562 * The directory is on the unlinked list and we did not find any 1563 * dirents. Set the link count to zero and let the directory 1564 * inactivate when the last reference drops. 1565 */ 1566 if (rd->dirents == 0) { 1567 rd->needs_adoption = false; 1568 new_nlink = 0; 1569 goto reset_nlink; 1570 } 1571 1572 /* 1573 * The directory is on the unlinked list and we found dirents. This 1574 * directory needs to be reachable via the directory tree. Remove the 1575 * dir from the unlinked list and update nlink with the observed link 1576 * count. If the directory has no parent, it will be moved to the 1577 * orphanage. 1578 */ 1579 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino)); 1580 if (!pag) { 1581 ASSERT(0); 1582 return -EFSCORRUPTED; 1583 } 1584 1585 error = xfs_iunlink_remove(sc->tp, pag, dp); 1586 xfs_perag_put(pag); 1587 if (error) 1588 return error; 1589 1590 reset_nlink: 1591 if (VFS_I(dp)->i_nlink != new_nlink) 1592 set_nlink(VFS_I(dp), new_nlink); 1593 return 0; 1594 } 1595 1596 /* 1597 * Finish replaying stashed dirent updates, allocate a transaction for 1598 * exchanging data fork mappings, and take the ILOCKs of both directories 1599 * before we commit the new directory structure. 1600 */ 1601 STATIC int 1602 xrep_dir_finalize_tempdir( 1603 struct xrep_dir *rd) 1604 { 1605 struct xfs_scrub *sc = rd->sc; 1606 int error; 1607 1608 if (!xfs_has_parent(sc->mp)) 1609 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1610 1611 /* 1612 * Repair relies on the ILOCK to quiesce all possible dirent updates. 1613 * Replay all queued dirent updates into the tempdir before exchanging 1614 * the contents, even if that means dropping the ILOCKs and the 1615 * transaction. 1616 */ 1617 do { 1618 error = xrep_dir_replay_updates(rd); 1619 if (error) 1620 return error; 1621 1622 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx); 1623 if (error) 1624 return error; 1625 1626 if (xfarray_length(rd->dir_entries) == 0) 1627 break; 1628 1629 xchk_trans_cancel(sc); 1630 xrep_tempfile_iunlock_both(sc); 1631 } while (!xchk_should_terminate(sc, &error)); 1632 return error; 1633 } 1634 1635 /* Exchange the temporary directory's data fork with the one being repaired. */ 1636 STATIC int 1637 xrep_dir_swap( 1638 struct xrep_dir *rd) 1639 { 1640 struct xfs_scrub *sc = rd->sc; 1641 xfs_ino_t ino; 1642 bool ip_local, temp_local; 1643 int error = 0; 1644 1645 /* 1646 * If we never found the parent for this directory, temporarily assign 1647 * the root dir as the parent; we'll move this to the orphanage after 1648 * exchanging the dir contents. We hold the ILOCK of the dir being 1649 * repaired, so we're not worried about racy updates of dotdot. 1650 */ 1651 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); 1652 if (rd->pscan.parent_ino == NULLFSINO) { 1653 rd->needs_adoption = true; 1654 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino; 1655 } 1656 1657 /* 1658 * Reset the temporary directory's '..' entry to point to the parent 1659 * that we found. The dirent replace code asserts if the dirent 1660 * already points at the new inumber, so we look it up here. 1661 * 1662 * It's also possible that this replacement could also expand a sf 1663 * tempdir into block format. 1664 */ 1665 error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino); 1666 if (error) 1667 return error; 1668 1669 if (rd->pscan.parent_ino != ino) { 1670 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, 1671 rd->pscan.parent_ino, rd->tx.req.resblks); 1672 if (error) 1673 return error; 1674 } 1675 1676 /* 1677 * Changing the dot and dotdot entries could have changed the shape of 1678 * the directory, so we recompute these. 1679 */ 1680 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1681 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL; 1682 1683 /* 1684 * If the both files have a local format data fork and the rebuilt 1685 * directory data would fit in the repaired file's data fork, copy 1686 * the contents from the tempfile and update the directory link count. 1687 * We're done now. 1688 */ 1689 if (ip_local && temp_local && 1690 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) { 1691 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK); 1692 return xrep_dir_set_nlink(rd); 1693 } 1694 1695 /* 1696 * Clean the transaction before we start working on exchanging 1697 * directory contents. 1698 */ 1699 error = xrep_tempfile_roll_trans(rd->sc); 1700 if (error) 1701 return error; 1702 1703 /* Otherwise, make sure both data forks are in block-mapping mode. */ 1704 error = xrep_dir_swap_prep(sc, temp_local, ip_local); 1705 if (error) 1706 return error; 1707 1708 /* 1709 * Set nlink of the directory in the same transaction sequence that 1710 * (atomically) commits the new directory data. 1711 */ 1712 error = xrep_dir_set_nlink(rd); 1713 if (error) 1714 return error; 1715 1716 return xrep_tempexch_contents(sc, &rd->tx); 1717 } 1718 1719 /* 1720 * Exchange the new directory contents (which we created in the tempfile) with 1721 * the directory being repaired. 1722 */ 1723 STATIC int 1724 xrep_dir_rebuild_tree( 1725 struct xrep_dir *rd) 1726 { 1727 struct xfs_scrub *sc = rd->sc; 1728 int error; 1729 1730 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino); 1731 1732 /* 1733 * Take the IOLOCK on the temporary file so that we can run dir 1734 * operations with the same locks held as we would for a normal file. 1735 * We still hold sc->ip's IOLOCK. 1736 */ 1737 error = xrep_tempfile_iolock_polled(rd->sc); 1738 if (error) 1739 return error; 1740 1741 /* 1742 * Allocate transaction, lock inodes, and make sure that we've replayed 1743 * all the stashed dirent updates to the tempdir. After this point, 1744 * we're ready to exchange data fork mappings. 1745 */ 1746 error = xrep_dir_finalize_tempdir(rd); 1747 if (error) 1748 return error; 1749 1750 if (xchk_iscan_aborted(&rd->pscan.iscan)) 1751 return -ECANCELED; 1752 1753 /* 1754 * Exchange the tempdir's data fork with the file being repaired. This 1755 * recreates the transaction and re-takes the ILOCK in the scrub 1756 * context. 1757 */ 1758 error = xrep_dir_swap(rd); 1759 if (error) 1760 return error; 1761 1762 /* 1763 * Release the old directory blocks and reset the data fork of the temp 1764 * directory to an empty shortform directory because inactivation does 1765 * nothing for directories. 1766 */ 1767 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino); 1768 if (error) 1769 return error; 1770 1771 /* 1772 * Roll to get a transaction without any inodes joined to it. Then we 1773 * can drop the tempfile's ILOCK and IOLOCK before doing more work on 1774 * the scrub target directory. 1775 */ 1776 error = xfs_trans_roll(&sc->tp); 1777 if (error) 1778 return error; 1779 1780 xrep_tempfile_iunlock(sc); 1781 xrep_tempfile_iounlock(sc); 1782 return 0; 1783 } 1784 1785 /* Set up the filesystem scan so we can regenerate directory entries. */ 1786 STATIC int 1787 xrep_dir_setup_scan( 1788 struct xrep_dir *rd) 1789 { 1790 struct xfs_scrub *sc = rd->sc; 1791 char *descr; 1792 int error; 1793 1794 /* Set up some staging memory for salvaging dirents. */ 1795 descr = xchk_xfile_ino_descr(sc, "directory entries"); 1796 error = xfarray_create(descr, 0, sizeof(struct xrep_dirent), 1797 &rd->dir_entries); 1798 kfree(descr); 1799 if (error) 1800 return error; 1801 1802 descr = xchk_xfile_ino_descr(sc, "directory entry names"); 1803 error = xfblob_create(descr, &rd->dir_names); 1804 kfree(descr); 1805 if (error) 1806 goto out_xfarray; 1807 1808 if (xfs_has_parent(sc->mp)) 1809 error = __xrep_findparent_scan_start(sc, &rd->pscan, 1810 xrep_dir_live_update); 1811 else 1812 error = xrep_findparent_scan_start(sc, &rd->pscan); 1813 if (error) 1814 goto out_xfblob; 1815 1816 return 0; 1817 1818 out_xfblob: 1819 xfblob_destroy(rd->dir_names); 1820 rd->dir_names = NULL; 1821 out_xfarray: 1822 xfarray_destroy(rd->dir_entries); 1823 rd->dir_entries = NULL; 1824 return error; 1825 } 1826 1827 /* 1828 * Move the current file to the orphanage. 1829 * 1830 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks. Upon 1831 * successful return, the scrub transaction will have enough extra reservation 1832 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the 1833 * orphanage; and both inodes will be ijoined. 1834 */ 1835 STATIC int 1836 xrep_dir_move_to_orphanage( 1837 struct xrep_dir *rd) 1838 { 1839 struct xfs_scrub *sc = rd->sc; 1840 xfs_ino_t orig_parent, new_parent; 1841 int error; 1842 1843 /* 1844 * We are about to drop the ILOCK on sc->ip to lock the orphanage and 1845 * prepare for the adoption. Therefore, look up the old dotdot entry 1846 * for sc->ip so that we can compare it after we re-lock sc->ip. 1847 */ 1848 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent); 1849 if (error) 1850 return error; 1851 1852 /* 1853 * Drop the ILOCK on the scrub target and commit the transaction. 1854 * Adoption computes its own resource requirements and gathers the 1855 * necessary components. 1856 */ 1857 error = xrep_trans_commit(sc); 1858 if (error) 1859 return error; 1860 xchk_iunlock(sc, XFS_ILOCK_EXCL); 1861 1862 /* If we can take the orphanage's iolock then we're ready to move. */ 1863 if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) { 1864 xchk_iunlock(sc, sc->ilock_flags); 1865 error = xrep_orphanage_iolock_two(sc); 1866 if (error) 1867 return error; 1868 } 1869 1870 /* Grab transaction and ILOCK the two files. */ 1871 error = xrep_adoption_trans_alloc(sc, &rd->adoption); 1872 if (error) 1873 return error; 1874 1875 error = xrep_adoption_compute_name(&rd->adoption, &rd->xname); 1876 if (error) 1877 return error; 1878 1879 /* 1880 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot 1881 * entry again. If the parent changed or the child was unlinked while 1882 * the child directory was unlocked, we don't need to move the child to 1883 * the orphanage after all. 1884 */ 1885 error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent); 1886 if (error) 1887 return error; 1888 1889 /* 1890 * Attach to the orphanage if we still have a linked directory and it 1891 * hasn't been moved. 1892 */ 1893 if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) { 1894 error = xrep_adoption_move(&rd->adoption); 1895 if (error) 1896 return error; 1897 } 1898 1899 /* 1900 * Launder the scrub transaction so we can drop the orphanage ILOCK 1901 * and IOLOCK. Return holding the scrub target's ILOCK and IOLOCK. 1902 */ 1903 error = xrep_adoption_trans_roll(&rd->adoption); 1904 if (error) 1905 return error; 1906 1907 xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL); 1908 xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL); 1909 return 0; 1910 } 1911 1912 /* 1913 * Repair the directory metadata. 1914 * 1915 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer 1916 * cache in XFS can't handle aliased multiblock buffers, so this might 1917 * misbehave if the directory blocks are crosslinked with other filesystem 1918 * metadata. 1919 * 1920 * XXX: Is it necessary to check the dcache for this directory to make sure 1921 * that we always recreate every cached entry? 1922 */ 1923 int 1924 xrep_directory( 1925 struct xfs_scrub *sc) 1926 { 1927 struct xrep_dir *rd = sc->buf; 1928 int error; 1929 1930 /* The rmapbt is required to reap the old data fork. */ 1931 if (!xfs_has_rmapbt(sc->mp)) 1932 return -EOPNOTSUPP; 1933 /* We require atomic file exchange range to rebuild anything. */ 1934 if (!xfs_has_exchange_range(sc->mp)) 1935 return -EOPNOTSUPP; 1936 1937 error = xrep_dir_setup_scan(rd); 1938 if (error) 1939 return error; 1940 1941 if (xfs_has_parent(sc->mp)) 1942 error = xrep_dir_scan_dirtree(rd); 1943 else 1944 error = xrep_dir_salvage_entries(rd); 1945 if (error) 1946 goto out_teardown; 1947 1948 /* Last chance to abort before we start committing fixes. */ 1949 if (xchk_should_terminate(sc, &error)) 1950 goto out_teardown; 1951 1952 error = xrep_dir_rebuild_tree(rd); 1953 if (error) 1954 goto out_teardown; 1955 1956 if (rd->needs_adoption) { 1957 if (!xrep_orphanage_can_adopt(rd->sc)) 1958 error = -EFSCORRUPTED; 1959 else 1960 error = xrep_dir_move_to_orphanage(rd); 1961 if (error) 1962 goto out_teardown; 1963 } 1964 1965 out_teardown: 1966 xrep_dir_teardown(sc); 1967 return error; 1968 } 1969