// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
#include "recovery.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"

#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */

/* Build a struct qstr from a NUL-terminated string; hash field left zeroed. */
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }

/*
 * XXX: this is handling transaction restarts without returning
 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
 */
/*
 * Sum the sizes of all allocated extents of @inum in @snapshot.
 * Returns the sector count, or a negative error code.
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 sectors = 0;
	int ret;

	for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ret)
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;

	bch2_trans_iter_exit(trans, &iter);

	/* ret is negative on error; on success fall through to the count */
	return ret ?: sectors;
}

/*
 * Count dirents of @inum in @snapshot whose d_type is DT_DIR; used to
 * validate a directory's i_nlink. Returns count or negative error.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	u64 subdirs = 0;
	int ret;

	for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ret) {
		if (k.k->type != KEY_TYPE_dirent)
			continue;

		d = bkey_s_c_to_dirent(k);
		if (d.v->d_type == DT_DIR)
			subdirs++;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret ?: subdirs;
}

/*
 * Look up the subvolume that owns snapshot @snapshot, via the snapshots
 * btree. *subvol is only written on success.
 */
static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
				    u32 *subvol)
{
	struct bch_snapshot s;
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
					  POS(0, snapshot), 0,
					  snapshot, &s);
	if (!ret)
		*subvol = le32_to_cpu(s.subvol);
	else if (bch2_err_matches(ret, ENOENT))
		bch_err(trans->c, "snapshot %u not found", snapshot);
	return ret;

}

/*
 * Resolve @subvol to its snapshot id and root inode number.
 * NOTE(review): *snapshot/*inum are written from s even on error - callers
 * appear to check ret before using them; confirm against callers.
 */
static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
			   u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret;

	ret = bch2_subvolume_get(trans, subvol, false, 0, &s);

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return ret;
}

/* Transaction-restart-safe wrapper around __subvol_lookup(). */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
}

/*
 * Fetch the first visible version of inode @inode_nr, in any snapshot
 * (BTREE_ITER_ALL_SNAPSHOTS peek returns the first key at that inode nr).
 */
static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
			      struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
			     POS(0, inode_nr),
			     BTREE_ITER_ALL_SNAPSHOTS);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
		ret = -BCH_ERR_ENOENT_inode;
		goto err;
	}

	ret = bch2_inode_unpack(k, inode);
err:
	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Fetch inode @inode_nr as seen from *snapshot; on success *snapshot is
 * updated to the snapshot the inode was actually found in.
 */
static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
			  struct bch_inode_unpacked *inode,
			  u32 *snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, inode_nr, *snapshot), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	ret = bkey_is_inode(k.k)
		? 
bch2_inode_unpack(k, inode)
		: -BCH_ERR_ENOENT_inode;
	if (!ret)
		*snapshot = iter.pos.snapshot;
err:
	bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Transaction-restart-safe wrapper around __lookup_inode(). */
static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
			struct bch_inode_unpacked *inode,
			u32 *snapshot)
{
	return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
}

/*
 * Hash-lookup dirent @name in @dir; returns target inum and d_type.
 * NOTE(review): d_inum is read even for DT_SUBVOL dirents, where the
 * target field has different semantics - confirm callers only use this
 * on non-subvolume dirents.
 */
static int __lookup_dirent(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type)
{
	struct btree_iter iter;
	struct bkey_s_c_dirent d;
	int ret;

	ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
			       &hash_info, dir, name, 0);
	if (ret)
		return ret;

	d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

/*
 * Pack and write back @inode at @snapshot, as a non-extent update with
 * internal-snapshot-node handling (fsck may write to interior snapshots).
 */
static int __write_inode(struct btree_trans *trans,
			 struct bch_inode_unpacked *inode,
			 u32 snapshot)
{
	struct bkey_inode_buf *inode_p =
		bch2_trans_kmalloc(trans, sizeof(*inode_p));

	if (IS_ERR(inode_p))
		return PTR_ERR(inode_p);

	bch2_inode_pack(inode_p, inode);
	inode_p->inode.k.p.snapshot = snapshot;

	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
				&inode_p->inode.k_i,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}

/* __write_inode() + commit, with fsck's NOFAIL/LAZY_RW commit flags. */
static int fsck_write_inode(struct btree_trans *trans,
			    struct bch_inode_unpacked *inode,
			    u32 snapshot)
{
	int ret = commit_do(trans, NULL, NULL,
			    BTREE_INSERT_NOFAIL|
			    BTREE_INSERT_LAZY_RW,
			    __write_inode(trans, inode, snapshot));
	if (ret)
		bch_err_fn(trans->c, ret);
	return ret;
}

/*
 * Delete the dirent at @pos via the dirent hash, using the owning
 * directory's hash info (needed so the hash chain stays consistent).
 */
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bch_inode_unpacked dir_inode;
	struct bch_hash_info dir_hash_info;
	int ret;

	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
	if (ret)
		goto err;

	dir_hash_info = bch2_hash_info_init(c, &dir_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);

	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
				  &dir_hash_info, &iter,
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
	bch2_trans_iter_exit(trans, &iter);
err:
	bch_err_fn(c, ret);
	return ret;
}

/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
			    struct bch_inode_unpacked *lostfound)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root;
	struct bch_hash_info root_hash_info;
	struct qstr lostfound_str = QSTR("lost+found");
	subvol_inum root_inum = { .subvol = subvol };
	u64 inum = 0;
	unsigned d_type = 0;
	u32 snapshot;
	int ret;

	ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
	if (ret)
		return ret;

	ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root);

	ret = __lookup_dirent(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type);
	if (bch2_err_matches(ret, ENOENT)) {
		bch_notice(c, "creating lost+found");
		goto create_lostfound;
	}

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	return __lookup_inode(trans, inum, lostfound, &snapshot);

create_lostfound:
	bch2_inode_init_early(c, lostfound);

	ret = bch2_create_trans(trans, root_inum, &root,
				lostfound, &lostfound_str,
				0, 0, S_IFDIR|0700, 0, NULL, NULL,
				(subvol_inum) { }, 0);
	bch_err_msg(c, ret, "creating lost+found");
	return ret;
}

/*
 * Link an unreachable inode into lost+found under its inode number,
 * updating its backpointer fields and, for directories, lost+found's
 * nlink.
 */
static int __reattach_inode(struct btree_trans *trans,
			    struct bch_inode_unpacked *inode,
			    u32 inode_snapshot)
{
	struct bch_hash_info dir_hash;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	struct qstr name;
	u64 dir_offset = 0;
	u32 subvol;
	int ret;

	ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
	if (ret)
		return ret;

	ret = lookup_lostfound(trans, subvol, &lostfound);
	if (ret)
		return ret;

	if (S_ISDIR(inode->bi_mode)) {
		lostfound.bi_nlink++;

		/* written at U32_MAX so the update covers all snapshots */
		ret = __write_inode(trans, &lostfound, U32_MAX);
		if (ret)
			return ret;
	}

	dir_hash = bch2_hash_info_init(trans->c, &lostfound);

	/* new name in lost+found is the inode number itself */
	snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	name = (struct qstr) QSTR(name_buf);

	ret = bch2_dirent_create(trans,
				 (subvol_inum) {
					.subvol = subvol,
					.inum = lostfound.bi_inum,
				 },
				 &dir_hash,
				 inode_d_type(inode),
				 &name, inode->bi_inum, &dir_offset,
				 BCH_HASH_SET_MUST_CREATE);
	if (ret)
		return ret;

	inode->bi_dir		= lostfound.bi_inum;
	inode->bi_dir_offset	= dir_offset;

	return __write_inode(trans, inode, inode_snapshot);
}

/* __reattach_inode() + commit, logging failures. */
static int reattach_inode(struct btree_trans *trans,
			  struct bch_inode_unpacked *inode,
			  u32 inode_snapshot)
{
	int ret = commit_do(trans, NULL, NULL,
			    BTREE_INSERT_LAZY_RW|
			    BTREE_INSERT_NOFAIL,
			    __reattach_inode(trans, inode, inode_snapshot));
	bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
	return ret;
}

/* Delete the dirent that @inode's bi_dir/bi_dir_offset backpointer names. */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c_dirent d;
	int ret;

	d = 
bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
				     dirent);
	ret =   bkey_err(d) ?:
		__remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* One snapshot id we've seen a key in, plus its equivalence class. */
struct snapshots_seen_entry {
	u32			id;
	u32			equiv;
};

/*
 * Sorted list of snapshot ids already seen at the current key position;
 * pos tracks which position the list is valid for.
 */
struct snapshots_seen {
	struct bpos			pos;
	DARRAY(struct snapshots_seen_entry) ids;
};

static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}

static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}

/* Insert @id into the sorted ids list, keeping it ordered; dedups. */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	struct snapshots_seen_entry *i, n = {
		.id	= id,
		.equiv	= bch2_snapshot_equiv(c, id),
	};
	int ret = 0;

	darray_for_each(s->ids, i) {
		if (i->id == id)
			return 0;
		if (i->id > id)
			break;
	}

	ret = darray_insert_item(&s->ids, i - s->ids.data, n);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/*
 * Record that we saw a key at @pos in snapshot pos.snapshot; resets the
 * list when we move to a new position. Duplicate equivalence classes
 * mean snapshot deletion didn't finish, so schedule that recovery pass.
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	struct snapshots_seen_entry *i, n = {
		.id	= pos.snapshot,
		.equiv	= bch2_snapshot_equiv(c, pos.snapshot),
	};
	int ret = 0;

	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;

	s->pos = pos;
	s->pos.snapshot = n.equiv;

	darray_for_each(s->ids, i) {
		if (i->id == n.id)
			return 0;

		/*
		 * We currently don't rigorously track for snapshot cleanup
		 * needing to be run, so it shouldn't be a fsck error yet:
		 */
		if (i->equiv == n.equiv) {
			bch_err(c, "snapshot deletion did not finish:\n"
				" duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
				bch2_btree_ids[btree_id],
				pos.inode, pos.offset,
				i->id, n.id, n.equiv);
			return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
		}
	}

	ret = darray_push(&s->ids, n);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}

/**
 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
 * and @ancestor hasn't been overwritten in @seen
 *
 * @c:		filesystem handle
 * @seen:	list of snapshot ids already seen at current position
 * @id:		descendent snapshot id
 * @ancestor:	ancestor snapshot id
 *
 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);
	EBUG_ON(!bch2_snapshot_is_equiv(c, id));
	EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
	 * @ancestor and with @id as a descendant.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i].equiv >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
			return false;

	return true;
}

/**
 * ref_visible - given a key with snapshot id @src that points to a key with
 * snapshot id @dst, test whether there is some snapshot in which @dst is
 * visible.
 *
 * @c:		filesystem handle
 * @s:		list of snapshot IDs already seen at @src
 * @src:	snapshot ID of src key
 * @dst:	snapshot ID of dst key
 * Returns:	true if there is some snapshot in which @dst is visible
 *
 * Assumes we're visiting @src keys in natural key order
 */
static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
			u32 src, u32 dst)
{
	return dst <= src
		? 
key_visible_in_snapshot(c, s, dst, src)
		: bch2_snapshot_is_ancestor(c, src, dst);
}

/* As ref_visible(), but with a seen-list for whichever side is newer. */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	src = bch2_snapshot_equiv(c, src);
	dst = bch2_snapshot_equiv(c, dst);

	if (dst > src) {
		swap(dst, src);
		swap(dst_seen, src_seen);
	}
	return key_visible_in_snapshot(c, src_seen, dst, src);
}

/* Iterate inode versions in @_w visible from @_snapshot. */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))

/* One snapshot-version of the inode currently being walked. */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;
	bool			seen_this_pos;
	u64			count;	/* sectors/subdirs accumulated by the caller */
};

/*
 * Walks all snapshot versions of a given inode number alongside a scan of
 * another btree (extents, dirents), caching them in @inodes.
 */
struct inode_walker {
	bool				first_this_inode;
	bool				recalculate_sums;
	struct bpos			last_pos;

	DARRAY(struct inode_walker_entry) inodes;
};

static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
}

static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}

/* Unpack @inode and append it to the walker's list. */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	BUG_ON(bch2_inode_unpack(inode, &u));

	return darray_push(&w->inodes, ((struct inode_walker_entry) {
		.inode		= u,
		.snapshot	= bch2_snapshot_equiv(c, inode.k->p.snapshot),
	}));
}

/*
 * Repopulate @w with every snapshot version of inode @inum.
 * Returns a transaction-restart error if the scan restarted.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 restart_count = trans->restart_count;
	int ret;

	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (bkey_is_inode(k.k))
			add_inode(c, w, k);
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;

	return trans_was_restarted(trans, restart_count);
}

/*
 * Find the inode version a key in @snapshot belongs to. If the key isn't a
 * whiteout and only an ancestor-snapshot inode exists, synthesize a copy of
 * the inode in the key's snapshot (inserted in sorted position).
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
			  u32 snapshot, bool is_whiteout)
{
	struct inode_walker_entry *i;

	snapshot = bch2_snapshot_equiv(c, snapshot);

	darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(snapshot > i->snapshot);

	if (snapshot != i->snapshot && !is_whiteout) {
		struct inode_walker_entry new = *i;
		size_t pos;
		int ret;

		new.snapshot = snapshot;
		new.count = 0;

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
			 w->last_pos.inode, snapshot, i->snapshot);

		/* keep the list sorted by snapshot id */
		while (i > w->inodes.data && i[-1].snapshot > snapshot)
			--i;

		pos = i - w->inodes.data;
		ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		i = w->inodes.data + pos;
	}

	return i;
}

/*
 * Advance the walker to @pos: refresh the inode list when we move to a new
 * inode number, then look up the matching snapshot version.
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w, struct bpos pos,
					     bool is_whiteout)
{
	if (w->last_pos.inode != pos.inode) {
		int ret = get_inodes_all_snapshots(trans, w, pos.inode);
		if (ret)
			return ERR_PTR(ret);
	} else if (bkey_cmp(w->last_pos, pos)) {
		struct inode_walker_entry *i;

		darray_for_each(w->inodes, i)
			i->seen_this_pos = false;

	}

	w->last_pos = pos;

	return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
}

/*
 * Collect the versions of inode @inum visible from s->pos.snapshot
 * (per ref_visible()); used for checking dirent/inode references.
 */
static int __get_visible_inodes(struct btree_trans *trans,
				struct inode_walker *w,
				struct snapshots_seen *s,
				u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, equiv))
			continue;

		if (bkey_is_inode(k.k))
			add_inode(c, w, k);

		if (equiv >= s->pos.snapshot)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

/*
 * Delete keys whose snapshot id no longer exists. Returns 1 (key deleted),
 * 0 (ok), or negative error.
 */
static int check_key_has_snapshot(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
				"key in missing snapshot: %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Re-insert @k at its correct hash position: delete it in place, then
 * re-run the hash insert, committing the whole repair.
 */
static int hash_redo_key(struct btree_trans *trans,
			 const struct bch_hash_desc desc,
			 struct bch_hash_info *hash_info,
			 struct btree_iter *k_iter, struct bkey_s_c k)
{
	struct bkey_i *delete;
	struct bkey_i *tmp;

	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
	if (IS_ERR(delete))
		return PTR_ERR(delete);

	tmp = bch2_bkey_make_mut_noupdate(trans, k);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	bkey_init(&delete->k);
	delete->k.p = k_iter->pos;
	return  bch2_btree_iter_traverse(k_iter) ?:
		bch2_trans_update(trans, k_iter, delete, 0) ?:
		bch2_hash_set_snapshot(trans, desc, hash_info,
				       (subvol_inum) { 0, k.k->p.inode },
				       k.k->p.snapshot, tmp,
				       BCH_HASH_SET_MUST_CREATE,
				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL| 758 BTREE_INSERT_LAZY_RW); 759 } 760 761 static int hash_check_key(struct btree_trans *trans, 762 const struct bch_hash_desc desc, 763 struct bch_hash_info *hash_info, 764 struct btree_iter *k_iter, struct bkey_s_c hash_k) 765 { 766 struct bch_fs *c = trans->c; 767 struct btree_iter iter = { NULL }; 768 struct printbuf buf = PRINTBUF; 769 struct bkey_s_c k; 770 u64 hash; 771 int ret = 0; 772 773 if (hash_k.k->type != desc.key_type) 774 return 0; 775 776 hash = desc.hash_bkey(hash_info, hash_k); 777 778 if (likely(hash == hash_k.k->p.offset)) 779 return 0; 780 781 if (hash_k.k->p.offset < hash) 782 goto bad_hash; 783 784 for_each_btree_key_norestart(trans, iter, desc.btree_id, 785 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 786 BTREE_ITER_SLOTS, k, ret) { 787 if (bkey_eq(k.k->p, hash_k.k->p)) 788 break; 789 790 if (fsck_err_on(k.k->type == desc.key_type && 791 !desc.cmp_bkey(k, hash_k), c, 792 "duplicate hash table keys:\n%s", 793 (printbuf_reset(&buf), 794 bch2_bkey_val_to_text(&buf, c, hash_k), 795 buf.buf))) { 796 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; 797 break; 798 } 799 800 if (bkey_deleted(k.k)) { 801 bch2_trans_iter_exit(trans, &iter); 802 goto bad_hash; 803 } 804 } 805 out: 806 bch2_trans_iter_exit(trans, &iter); 807 printbuf_exit(&buf); 808 return ret; 809 bad_hash: 810 if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", 811 bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, 812 (printbuf_reset(&buf), 813 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 814 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); 815 bch_err_fn(c, ret); 816 if (ret) 817 return ret; 818 ret = -BCH_ERR_transaction_restart_nested; 819 } 820 fsck_err: 821 goto out; 822 } 823 824 static int check_inode(struct btree_trans *trans, 825 struct btree_iter *iter, 826 struct bkey_s_c k, 827 struct bch_inode_unpacked *prev, 828 struct 
snapshots_seen *s, 829 bool full) 830 { 831 struct bch_fs *c = trans->c; 832 struct bch_inode_unpacked u; 833 bool do_update = false; 834 int ret; 835 836 ret = check_key_has_snapshot(trans, iter, k); 837 if (ret < 0) 838 goto err; 839 if (ret) 840 return 0; 841 842 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 843 if (ret) 844 goto err; 845 846 if (!bkey_is_inode(k.k)) 847 return 0; 848 849 BUG_ON(bch2_inode_unpack(k, &u)); 850 851 if (!full && 852 !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| 853 BCH_INODE_I_SECTORS_DIRTY| 854 BCH_INODE_UNLINKED))) 855 return 0; 856 857 if (prev->bi_inum != u.bi_inum) 858 *prev = u; 859 860 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || 861 inode_d_type(prev) != inode_d_type(&u), c, 862 "inodes in different snapshots don't match")) { 863 bch_err(c, "repair not implemented yet"); 864 return -EINVAL; 865 } 866 867 if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && 868 bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { 869 struct bpos new_min_pos; 870 871 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); 872 if (ret) 873 goto err; 874 875 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; 876 877 ret = __write_inode(trans, &u, iter->pos.snapshot); 878 bch_err_msg(c, ret, "in fsck updating inode"); 879 if (ret) 880 return ret; 881 882 if (!bpos_eq(new_min_pos, POS_MIN)) 883 bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); 884 return 0; 885 } 886 887 if (u.bi_flags & BCH_INODE_UNLINKED && 888 (!c->sb.clean || 889 fsck_err(c, "filesystem marked clean, but inode %llu unlinked", 890 u.bi_inum))) { 891 bch2_trans_unlock(trans); 892 bch2_fs_lazy_rw(c); 893 894 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 895 bch_err_msg(c, ret, "in fsck deleting inode"); 896 return ret; 897 } 898 899 if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && 900 (!c->sb.clean || 901 fsck_err(c, "filesystem marked clean, but inode %llu has i_size 
dirty", 902 u.bi_inum))) { 903 bch_verbose(c, "truncating inode %llu", u.bi_inum); 904 905 bch2_trans_unlock(trans); 906 bch2_fs_lazy_rw(c); 907 908 /* 909 * XXX: need to truncate partial blocks too here - or ideally 910 * just switch units to bytes and that issue goes away 911 */ 912 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, 913 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, 914 iter->pos.snapshot), 915 POS(u.bi_inum, U64_MAX), 916 0, NULL); 917 bch_err_msg(c, ret, "in fsck truncating inode"); 918 if (ret) 919 return ret; 920 921 /* 922 * We truncated without our normal sector accounting hook, just 923 * make sure we recalculate it: 924 */ 925 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; 926 927 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; 928 do_update = true; 929 } 930 931 if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && 932 (!c->sb.clean || 933 fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", 934 u.bi_inum))) { 935 s64 sectors; 936 937 bch_verbose(c, "recounting sectors for inode %llu", 938 u.bi_inum); 939 940 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); 941 if (sectors < 0) { 942 bch_err_msg(c, sectors, "in fsck recounting inode sectors"); 943 return sectors; 944 } 945 946 u.bi_sectors = sectors; 947 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; 948 do_update = true; 949 } 950 951 if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { 952 u.bi_dir = 0; 953 u.bi_dir_offset = 0; 954 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; 955 do_update = true; 956 } 957 958 if (do_update) { 959 ret = __write_inode(trans, &u, iter->pos.snapshot); 960 bch_err_msg(c, ret, "in fsck updating inode"); 961 if (ret) 962 return ret; 963 } 964 err: 965 fsck_err: 966 bch_err_fn(c, ret); 967 return ret; 968 } 969 970 noinline_for_stack 971 int bch2_check_inodes(struct bch_fs *c) 972 { 973 bool full = c->opts.fsck; 974 struct btree_trans *trans = bch2_trans_get(c); 975 struct btree_iter iter; 976 struct bch_inode_unpacked prev 
= { 0 };
	struct snapshots_seen s;
	struct bkey_s_c k;
	int ret;

	snapshots_seen_init(&s);

	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
			POS_MIN,
			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_inode(trans, &iter, k, &prev, &s, full));

	snapshots_seen_exit(&s);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

/* Fetch the dirent at @pos, leaving @iter pointing at it. */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}

/* Does @inode's backpointer name the dirent at @d's position? */
static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
				   struct bkey_s_c_dirent d)
{
	return  inode->bi_dir		== d.k->p.inode &&
		inode->bi_dir_offset	== d.k->p.offset;
}

/* Does dirent @d point at @inode (or its subvolume, for DT_SUBVOL)? */
static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
				   struct bch_inode_unpacked *inode)
{
	return d.v->d_type == DT_SUBVOL
		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
}

/*
 * Check that the dirent named by @inode's backpointer exists and points
 * back at @inode. Returns 1/0 for found/not found, or negative error;
 * a missing dirent (ENOENT) is reported as 0, not an error.
 */
static int inode_backpointer_exists(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c_dirent d;
	int ret;

	d = dirent_get_by_pos(trans, &iter,
			SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
	ret = bkey_err(d);
	if (ret)
		return bch2_err_matches(ret, ENOENT) ? 0 : ret;

	ret = dirent_points_to_inode(d, inode);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * After scanning an inode's extents: verify each version's accumulated
 * sector count against bi_sectors, repairing when I_SECTORS_DIRTY isn't
 * set (in which case check_inode will recount anyway).
 */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	u32 restart_count = trans->restart_count;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		if (i->count != count2) {
			bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
				w->last_pos.inode, i->snapshot, i->count, count2);
			return -BCH_ERR_internal_fsck_err;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
			    "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
			    w->last_pos.inode, i->snapshot,
			    i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret ?: trans_was_restarted(trans, restart_count);
}

/* End position of a previously-seen extent in one snapshot. */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;	/* owns a copy of the seen-list */
};

/* Per-inode list of extent end positions, used for overlap detection. */
struct extent_ends {
	struct bpos			last_pos;
	DARRAY(struct extent_end)	e;
};

static void extent_ends_reset(struct extent_ends *extent_ends)
{
	struct extent_end *i;

	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);

	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}

static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}

/*
 * Record @k's end position for its snapshot, replacing any previous entry
 * for the same snapshot; takes a private copy of @seen.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}

/*
 * Repair a pair of overlapping extents: re-find both keys, then rewrite
 * the older (ancestor-snapshot) one so they no longer overlap.
 * Sets *fixed on success; may return transaction_restart_nested if the
 * second extent was the one modified.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_ALL_SNAPSHOTS|
			     BTREE_ITER_NOT_EXTENTS);
	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_str(&buf, "\n ");
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\n wanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n ");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

1175 bch2_trans_copy_iter(&iter2, &iter1); 1176 1177 while (1) { 1178 bch2_btree_iter_advance(&iter2); 1179 1180 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); 1181 ret = bkey_err(k2); 1182 if (ret) 1183 goto err; 1184 1185 if (bpos_ge(k2.k->p, pos2.p)) 1186 break; 1187 } 1188 1189 prt_str(&buf, "\n "); 1190 bch2_bkey_val_to_text(&buf, c, k2); 1191 1192 if (bpos_gt(k2.k->p, pos2.p) || 1193 pos2.size != k2.k->size) { 1194 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1195 __func__, buf.buf); 1196 ret = -BCH_ERR_internal_fsck_err; 1197 goto err; 1198 } 1199 1200 prt_printf(&buf, "\n overwriting %s extent", 1201 pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); 1202 1203 if (fsck_err(c, "overlapping extents%s", buf.buf)) { 1204 struct btree_iter *old_iter = &iter1; 1205 struct disk_reservation res = { 0 }; 1206 1207 if (pos1.snapshot < pos2.p.snapshot) { 1208 old_iter = &iter2; 1209 swap(k1, k2); 1210 } 1211 1212 trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); 1213 1214 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1215 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, 1216 k1, k2) ?: 1217 bch2_trans_commit(trans, &res, NULL, 1218 BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); 1219 bch2_disk_reservation_put(c, &res); 1220 1221 if (ret) 1222 goto err; 1223 1224 *fixed = true; 1225 1226 if (pos1.snapshot == pos2.p.snapshot) { 1227 /* 1228 * We overwrote the first extent, and did the overwrite 1229 * in the same snapshot: 1230 */ 1231 extent_end->offset = bkey_start_offset(&pos2); 1232 } else if (pos1.snapshot > pos2.p.snapshot) { 1233 /* 1234 * We overwrote the first extent in pos2's snapshot: 1235 */ 1236 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1237 } else { 1238 /* 1239 * We overwrote the second extent - restart 1240 * check_extent() from the top: 1241 */ 1242 ret = -BCH_ERR_transaction_restart_nested; 1243 } 1244 } 1245 fsck_err: 1246 err: 1247 bch2_trans_iter_exit(trans, 
&iter2); 1248 bch2_trans_iter_exit(trans, &iter1); 1249 printbuf_exit(&buf); 1250 return ret; 1251 } 1252 1253 static int check_overlapping_extents(struct btree_trans *trans, 1254 struct snapshots_seen *seen, 1255 struct extent_ends *extent_ends, 1256 struct bkey_s_c k, 1257 u32 equiv, 1258 struct btree_iter *iter, 1259 bool *fixed) 1260 { 1261 struct bch_fs *c = trans->c; 1262 struct extent_end *i; 1263 int ret = 0; 1264 1265 /* transaction restart, running again */ 1266 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1267 return 0; 1268 1269 if (extent_ends->last_pos.inode != k.k->p.inode) 1270 extent_ends_reset(extent_ends); 1271 1272 darray_for_each(extent_ends->e, i) { 1273 if (i->offset <= bkey_start_offset(k.k)) 1274 continue; 1275 1276 if (!ref_visible2(c, 1277 k.k->p.snapshot, seen, 1278 i->snapshot, &i->seen)) 1279 continue; 1280 1281 ret = overlapping_extents_found(trans, iter->btree_id, 1282 SPOS(iter->pos.inode, 1283 i->offset, 1284 i->snapshot), 1285 &i->seen, 1286 *k.k, fixed, i); 1287 if (ret) 1288 goto err; 1289 } 1290 1291 ret = extent_ends_at(c, extent_ends, seen, k); 1292 if (ret) 1293 goto err; 1294 1295 extent_ends->last_pos = k.k->p; 1296 err: 1297 return ret; 1298 } 1299 1300 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1301 struct bkey_s_c k, 1302 struct inode_walker *inode, 1303 struct snapshots_seen *s, 1304 struct extent_ends *extent_ends) 1305 { 1306 struct bch_fs *c = trans->c; 1307 struct inode_walker_entry *i; 1308 struct printbuf buf = PRINTBUF; 1309 struct bpos equiv = k.k->p; 1310 int ret = 0; 1311 1312 equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); 1313 1314 ret = check_key_has_snapshot(trans, iter, k); 1315 if (ret) { 1316 ret = ret < 0 ? 
		      ret : 0;
		goto out;
	}

	/* New inode: finalize i_sectors accounting for the previous one */
	if (inode->last_pos.inode != k.k->p.inode) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		goto err;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		/* An extent whose inode doesn't exist gets deleted */
		if (fsck_err_on(!i, c,
				"extent in missing inode:\n %s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			goto delete;

		/* Only regular files and symlinks may own extents */
		if (fsck_err_on(i &&
				!S_ISREG(i->inode.bi_mode) &&
				!S_ISLNK(i->inode.bi_mode), c,
				"extent in non regular inode mode %o:\n %s",
				i->inode.bi_mode,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			goto delete;

		ret = check_overlapping_extents(trans, s, extent_ends, k,
						equiv.snapshot, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;
	}

	/*
	 * Check inodes in reverse order, from oldest snapshots to newest,
	 * starting from the inode that matches this extent's snapshot. If we
	 * didn't have one, iterate over all inodes:
	 */
	if (!i)
		i = inode->inodes.data + inode->inodes.nr - 1;

	for (;
	     inode->inodes.data && i >= inode->inodes.data;
	     --i) {
		if (i->snapshot > equiv.snapshot ||
		    !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
			continue;

		if (k.k->type != KEY_TYPE_whiteout) {
			/* Extent past i_size (and not a reservation): delete */
			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k), c,
					"extent type past end of inode %llu:%u, i_size %llu\n %s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				/* Delete in this inode's snapshot only */
				bch2_trans_copy_iter(&iter2, iter);
				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(&iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
			}

			if (bkey_extent_is_allocation(k.k))
				i->count += k.k->size;
		}

		i->seen_this_pos = true;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
	goto out;
}

/*
 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };
	int ret = 0;

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
			&res, NULL,
			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
		/* Drop the previous iteration's reservation before reuse */
		bch2_disk_reservation_put(c, &res);
		check_extent(trans, &iter, k, &w, &s, &extent_ends);
	})) ?:
	check_i_sectors(trans, &w);

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);
	bch2_trans_put(trans);

	bch_err_fn(c, ret);
	return ret;
}

/*
 * Verify/repair i_nlink for every directory inode gathered in @w by
 * recounting its subdirectory dirents.
 */
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	u32 restart_count = trans->restart_count;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* Recount from the dirents btree to cross-check fsck's tally */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
				i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count, c,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret ?: trans_was_restarted(trans, restart_count);
}

/*
 * Verify dirent @d against the inode it points at (@target): fix missing or
 * wrong backpointers, duplicate directory links, wrong d_type, and (for
 * subvolume dirents) the d_parent_subvol field.
 */
static int check_dirent_target(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c_dirent d,
			       struct bch_inode_unpacked *target,
			       u32 target_snapshot)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_dirent *n;
	bool backpointer_exists = true;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	/* Inode has no backpointer at all: point it at this dirent */
	if (!target->bi_dir &&
	    !target->bi_dir_offset) {
		target->bi_dir		= d.k->p.inode;
		target->bi_dir_offset	= d.k->p.offset;

		ret = __write_inode(trans, target, target_snapshot);
		if (ret)
			goto err;
	}

	if (!inode_points_to_dirent(target, d)) {
		ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
		if (ret < 0)
			goto err;

		backpointer_exists = ret;
		ret = 0;

		/* Directories must not be multiply linked */
		if (fsck_err_on(S_ISDIR(target->bi_mode) &&
				backpointer_exists, c,
				"directory %llu with multiple links",
				target->bi_inum)) {
			ret = __remove_dirent(trans, d.k->p);
			goto out;
		}

		if (fsck_err_on(backpointer_exists &&
				!target->bi_nlink, c,
				"inode %llu type %s has multiple links but i_nlink 0",
				target->bi_inum, bch2_d_types[d.v->d_type])) {
			target->bi_nlink++;
			target->bi_flags &= ~BCH_INODE_UNLINKED;

			ret = __write_inode(trans, target, target_snapshot);
			if (ret)
				goto err;
		}

		if (fsck_err_on(!backpointer_exists, c,
				"inode %llu:%u has wrong backpointer:\n"
				"got       %llu:%llu\n"
				"should be %llu:%llu",
				target->bi_inum, target_snapshot,
				target->bi_dir,
				target->bi_dir_offset,
				d.k->p.inode,
				d.k->p.offset)) {
			target->bi_dir		= d.k->p.inode;
			target->bi_dir_offset	= d.k->p.offset;

			ret = __write_inode(trans, target, target_snapshot);
			if (ret)
				goto err;
		}
	}

	/* d_type must match the inode's mode */
	if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
			"incorrect d_type: got %s, should be %s:\n%s",
			bch2_d_type_str(d.v->d_type),
			bch2_d_type_str(inode_d_type(target)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_type = inode_d_type(target);

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		d = dirent_i_to_s_c(n);
	}

	/*
	 * Pre-subvol_dirent metadata versions didn't have d_parent_subvol;
	 * fix it unconditionally there, otherwise ask via fsck_err():
	 */
	if (d.v->d_type == DT_SUBVOL &&
	    target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
	    (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
	     fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
		      le32_to_cpu(d.v->d_parent_subvol),
		      target->bi_parent_subvol))) {
		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		bkey_reassemble(&n->k_i, d.s_c);
		n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);

		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
		if (ret)
			goto err;

		d = dirent_i_to_s_c(n);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Per-key pass for bch2_check_dirents(): validate the dirent's snapshot,
 * owning directory, hash, and target inode(s), repairing as needed.
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_dirent d;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	struct bpos equiv;
	int ret = 0;

	ret = check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return means the key was handled/deleted */
		ret = ret < 0 ?
		      ret : 0;
		goto out;
	}

	equiv = k.k->p;
	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* New directory: finalize subdir counts for the previous one */
	if (dir->last_pos.inode != k.k->p.inode) {
		ret = check_subdir_count(trans, dir);
		if (ret)
			goto err;
	}

	BUG_ON(!iter->path->should_be_locked);

	i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	if (dir->first_this_inode && dir->inodes.nr)
		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
	dir->first_this_inode = false;

	if (fsck_err_on(!i, c,
			"dirent in nonexisting directory:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
		goto out;
	}

	if (!i)
		goto out;

	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
			"dirent in non directory inode type %s:\n%s",
			bch2_d_type_str(inode_d_type(&i->inode)),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	d = bkey_s_c_to_dirent(k);

	if (d.v->d_type == DT_SUBVOL) {
		struct bch_inode_unpacked subvol_root;
		u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
		u32 target_snapshot;
		u64 target_inum;

		ret = __subvol_lookup(trans, target_subvol,
				      &target_snapshot, &target_inum);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (fsck_err_on(ret, c,
				"dirent points to missing subvolume %u",
				le32_to_cpu(d.v->d_child_subvol))) {
			ret = __remove_dirent(trans, d.k->p);
			goto err;
		}

		ret = __lookup_inode(trans, target_inum,
				     &subvol_root, &target_snapshot);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (fsck_err_on(ret, c,
				"subvolume %u points to missing subvolume root %llu",
				target_subvol,
				target_inum)) {
			bch_err(c, "repair not implemented yet");
			ret = -EINVAL;
			goto err;
		}

		if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
				"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
				target_inum,
				subvol_root.bi_subvol, target_subvol)) {
			subvol_root.bi_subvol = target_subvol;
			ret = __write_inode(trans, &subvol_root, target_snapshot);
			if (ret)
				goto err;
		}

		ret = check_dirent_target(trans, iter, d, &subvol_root,
					  target_snapshot);
		if (ret)
			goto err;
	} else {
		/* May match several inodes across snapshots; check them all */
		ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr, c,
				"dirent points to missing inode: (equiv %u)\n%s",
				equiv.snapshot,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = check_dirent_target(trans, iter, d,
						  &i->inode, i->snapshot);
			if (ret)
				goto err;
		}
	}

	/* Tally subdirectory links for check_subdir_count() */
	if (d.v->d_type == DT_DIR)
		for_each_visible_inode(c, s, dir, equiv.snapshot, i)
			i->count++;

out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
 * validate d_type
 */
int
bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	snapshots_seen_init(&s);

	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
			k,
			NULL, NULL,
			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));

	bch2_trans_put(trans);
	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Per-key pass for bch2_check_xattrs(): delete xattrs whose inode is
 * missing, and verify the xattr's hash.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = check_key_has_snapshot(trans, iter, k);
	if (ret)
		return ret;

	i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	/* Hash seed comes from the first inode seen for this inode number */
	if (inode->first_this_inode && inode->inodes.nr)
		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
	inode->first_this_inode = false;

	if (fsck_err_on(!i, c,
			"xattr for missing inode %llu",
			k.k->p.inode))
		return bch2_btree_delete_at(trans, iter, 0);

	if (!i)
		return 0;

	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker
			    inode = inode_walker_init();
	struct bch_hash_info hash_info;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
			k,
			NULL, NULL,
			BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_xattr(trans, &iter, k, &hash_info, &inode)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Verify the root subvolume and root directory inode exist, creating them
 * if missing (with user confirmation via mustfix_fsck_err).
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
		struct bkey_i_subvolume root_subvol;

		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol.k_i);
		root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol.v.flags	= 0;
		root_subvol.v.snapshot	= cpu_to_le32(snapshot);
		root_subvol.v.inode	= cpu_to_le64(inum);
		ret = commit_do(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_LAZY_RW,
			bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
					    &root_subvol.k_i, 0));
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;

	}

	ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
				"root inode not a directory")) {
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;

		ret = __write_inode(trans, &root_inode, snapshot);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}

/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	int ret;

	ret = bch2_trans_do(c, NULL, NULL,
			    BTREE_INSERT_NOFAIL|
			    BTREE_INSERT_LAZY_RW,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}

/* One ancestor on the path from an inode up towards the root: */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;

/* Has (inum, snapshot) already been visited on this path? (loop detection) */
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
	struct pathbuf_entry *i;

	darray_for_each(*p, i)
		if (i->inum	== inum &&
		    i->snapshot	== snapshot)
			return true;

	return false;
}

/* Push (inum, snapshot) onto the path; logs on allocation failure */
static int path_down(struct bch_fs *c, pathbuf *p,
		     u64 inum, u32 snapshot)
{
	int ret = darray_push(p, ((struct pathbuf_entry) {
		.inum		= inum,
		.snapshot	= snapshot,
	}));

	if (ret)
		bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
			p->size);
	return ret;
}

/*
 * Check that a given inode is reachable from the root:
 *
 * XXX: we should also be verifying that inodes are in the right subvolumes
 */
static int check_path(struct btree_trans *trans,
		      pathbuf *p,
		      struct bch_inode_unpacked *inode,
		      u32 snapshot)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	snapshot = bch2_snapshot_equiv(c, snapshot);
	p->nr = 0;

	/* Walk backpointers up until we reach the root: */
	while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
		 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
		struct btree_iter dirent_iter;
		struct bkey_s_c_dirent d;
		u32 parent_snapshot = snapshot;

		/* Crossing a subvolume boundary: parent is in another snapshot */
		if (inode->bi_subvol) {
			u64 inum;

			ret = subvol_lookup(trans, inode->bi_parent_subvol,
					    &parent_snapshot, &inum);
			if (ret)
				break;
		}

		ret = lockrestart_do(trans,
			PTR_ERR_OR_ZERO((d =
					 dirent_get_by_pos(trans, &dirent_iter,
					  SPOS(inode->bi_dir, inode->bi_dir_offset,
					       parent_snapshot))).k));
		if (ret && !bch2_err_matches(ret, ENOENT))
			break;

		if (!ret && !dirent_points_to_inode(d, inode)) {
			bch2_trans_iter_exit(trans, &dirent_iter);
			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
		}

		/* No (valid) backpointer dirent: inode is unreachable */
		if (bch2_err_matches(ret, ENOENT)) {
			if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
				     inode->bi_inum, snapshot,
				     bch2_d_type_str(inode_d_type(inode)),
				     inode->bi_nlink,
				     inode->bi_dir,
				     inode->bi_dir_offset))
				ret = reattach_inode(trans, inode, snapshot);
			break;
		}

		bch2_trans_iter_exit(trans, &dirent_iter);

		if (!S_ISDIR(inode->bi_mode))
			break;

		ret = path_down(c, p, inode->bi_inum, snapshot);
		if (ret) {
			bch_err(c, "memory allocation failure");
			return ret;
		}

		snapshot = parent_snapshot;

		ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
		if (ret) {
			/* Should have been caught in dirents pass */
			bch_err(c, "error looking up parent directory: %i", ret);
			break;
		}

		if (path_is_dup(p, inode->bi_inum, snapshot)) {
			struct pathbuf_entry *i;

			/* XXX print path */
			bch_err(c, "directory structure loop");

			darray_for_each(*p, i)
				pr_err("%llu:%u", i->inum, i->snapshot);
			pr_err("%llu:%u", inode->bi_inum, snapshot);

			if (!fsck_err(c, "directory structure loop"))
				return 0;

			/* Break the loop by detaching this inode, then reattach */
			ret = commit_do(trans, NULL, NULL,
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_LAZY_RW,
				remove_backpointer(trans, inode));
			if (ret) {
				bch_err(c, "error removing dirent: %i", ret);
				break;
			}

			ret = reattach_inode(trans, inode, snapshot);
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Check for unreachable inodes, as well as loops in the directory structure:
 * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
 * unreachable:
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked u;
	pathbuf path = { 0, };
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
			   BTREE_ITER_INTENT|
			   BTREE_ITER_PREFETCH|
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		if (!bkey_is_inode(k.k))
			continue;

		ret = bch2_inode_unpack(k, &u);
		if (ret) {
			/* Should have been caught earlier in fsck: */
			bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
			break;
		}

		/* Unlinked inodes are legitimately unreachable */
		if (u.bi_flags & BCH_INODE_UNLINKED)
			continue;

		ret = check_path(trans, &path, &u, iter.pos.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	darray_exit(&path);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Growable table of (inode, snapshot) -> observed link count, sorted by
 * (inum, snapshot); filled by the nlink passes below.
 */
struct nlink_table {
	size_t		nr;
	size_t		size;

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;
	}		*d;
};

/* Append an entry, doubling the table as needed (count starts at 0) */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		if (t->d)
			memcpy(d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = d;
		t->size = new_size;
	}


	t->d[t->nr++] = (struct nlink) {
		.inum		= inum,
		.snapshot	= snapshot,
	};

	return 0;
}

/* Sort/search comparator: order by (inum, snapshot) */
static int nlink_cmp(const void *_l, const
					   void *_r)
{
	const struct nlink *l = _l;
	const struct nlink *r = _r;

	return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
}

/*
 * Bump the link count for @inum in @links for every table entry whose
 * snapshot can see the dirent's snapshot; only inodes within
 * [range_start, range_end) are tracked in this pass.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land anywhere in a run of equal inums; rewind to start */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}

/*
 * Pass 1: collect every non-directory inode with a nonzero nlink into @t,
 * starting at inode @start.  If the table fills up, *end is set to the first
 * uncollected inode so the caller can run another batch.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked u;
	int ret = 0;

	for_each_btree_key(trans, iter, BTREE_ID_inodes,
			   POS(0, start),
			   BTREE_ITER_INTENT|
			   BTREE_ITER_PREFETCH|
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		if (!bkey_is_inode(k.k))
			continue;

		/* Should never fail, checked by bch2_inode_invalid: */
		BUG_ON(bch2_inode_unpack(k, &u));

		/*
		 * Backpointer and directory structure checks are sufficient for
		 * directories, since they can't have hardlinks:
		 */
		if (S_ISDIR(u.bi_mode))
			continue;

		if (!u.bi_nlink)
			continue;

		ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
		if (ret) {
			/* Table full: record resume point and finish this batch */
			*end = k.k->p.offset;
			ret = 0;
			break;
		}

	}
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret)
		bch_err(c, "error in fsck: btree error %i while walking inodes", ret);

	return ret;
}

/*
 * Pass 2: walk all dirents and count the links they contribute to the
 * inodes collected in pass 1.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct snapshots_seen s;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_dirent d;
	int ret;

	snapshots_seen_init(&s);

	for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
			   BTREE_ITER_INTENT|
			   BTREE_ITER_PREFETCH|
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
		if (ret)
			break;

		switch (k.k->type) {
		case KEY_TYPE_dirent:
			d = bkey_s_c_to_dirent(k);

			/* Directories/subvols don't contribute to hardlink counts */
			if (d.v->d_type != DT_DIR &&
			    d.v->d_type != DT_SUBVOL)
				inc_link(c, &s, links, range_start, range_end,
					 le64_to_cpu(d.v->d_inum),
					 bch2_snapshot_equiv(c, d.k->p.snapshot));
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		bch_err(c, "error in fsck: btree error %i while walking dirents", ret);

	bch2_trans_put(trans);
	snapshots_seen_exit(&s);
	return ret;
}

/*
 * Pass 3 per-key helper: compare inode @k's stored nlink against the count
 * accumulated in @links, repairing mismatches.  Returns 1 once past
 * @range_end to stop the iteration.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	BUG_ON(bch2_inode_unpack(k, &u));

	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* Advance through the sorted table to this inode's entry */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
2300 link = &links->d[++*idx]; 2301 } 2302 2303 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, 2304 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 2305 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 2306 bch2_inode_nlink_get(&u), link->count)) { 2307 bch2_inode_nlink_set(&u, link->count); 2308 ret = __write_inode(trans, &u, k.k->p.snapshot); 2309 } 2310 fsck_err: 2311 return ret; 2312 } 2313 2314 noinline_for_stack 2315 static int check_nlinks_update_hardlinks(struct bch_fs *c, 2316 struct nlink_table *links, 2317 u64 range_start, u64 range_end) 2318 { 2319 struct btree_iter iter; 2320 struct bkey_s_c k; 2321 size_t idx = 0; 2322 int ret = 0; 2323 2324 ret = bch2_trans_run(c, 2325 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 2326 POS(0, range_start), 2327 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, 2328 NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, 2329 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 2330 if (ret < 0) { 2331 bch_err(c, "error in fsck: btree error %i while walking inodes", ret); 2332 return ret; 2333 } 2334 2335 return 0; 2336 } 2337 2338 int bch2_check_nlinks(struct bch_fs *c) 2339 { 2340 struct nlink_table links = { 0 }; 2341 u64 this_iter_range_start, next_iter_range_start = 0; 2342 int ret = 0; 2343 2344 do { 2345 this_iter_range_start = next_iter_range_start; 2346 next_iter_range_start = U64_MAX; 2347 2348 ret = check_nlinks_find_hardlinks(c, &links, 2349 this_iter_range_start, 2350 &next_iter_range_start); 2351 2352 ret = check_nlinks_walk_dirents(c, &links, 2353 this_iter_range_start, 2354 next_iter_range_start); 2355 if (ret) 2356 break; 2357 2358 ret = check_nlinks_update_hardlinks(c, &links, 2359 this_iter_range_start, 2360 next_iter_range_start); 2361 if (ret) 2362 break; 2363 2364 links.nr = 0; 2365 } while (next_iter_range_start != U64_MAX); 2366 2367 kvfree(links.d); 2368 bch_err_fn(c, ret); 2369 return ret; 2370 } 2371 2372 static int 
	   fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bkey_s_c_reflink_p p;
	struct bkey_i_reflink_p *u;
	int ret;

	if (k.k->type != KEY_TYPE_reflink_p)
		return 0;

	p = bkey_s_c_to_reflink_p(k);

	/* Nothing to fix if the pad fields are already zero */
	if (!p.v->front_pad && !p.v->back_pad)
		return 0;

	u = bch2_trans_kmalloc(trans, sizeof(*u));
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bkey_reassemble(&u->k_i, k);
	u->v.front_pad	= 0;
	u->v.back_pad	= 0;

	/* BTREE_TRIGGER_NORUN: don't re-run reference accounting triggers */
	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
}

/*
 * Clear stale front_pad/back_pad fields in all reflink pointers; only needed
 * for filesystems older than the reflink_p_fix metadata version.
 */
int bch2_fix_reflink_p(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
				BTREE_ITER_ALL_SNAPSHOTS, k,
				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}