1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_update.h" 7 #include "buckets.h" 8 #include "darray.h" 9 #include "dirent.h" 10 #include "error.h" 11 #include "fs-common.h" 12 #include "fsck.h" 13 #include "inode.h" 14 #include "keylist.h" 15 #include "recovery.h" 16 #include "snapshot.h" 17 #include "super.h" 18 #include "xattr.h" 19 20 #include <linux/bsearch.h> 21 #include <linux/dcache.h> /* struct qstr */ 22 23 /* 24 * XXX: this is handling transaction restarts without returning 25 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: 26 */ 27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 28 u32 snapshot) 29 { 30 u64 sectors = 0; 31 32 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, 33 SPOS(inum, 0, snapshot), 34 POS(inum, U64_MAX), 35 0, k, ({ 36 if (bkey_extent_is_allocation(k.k)) 37 sectors += k.k->size; 38 0; 39 })); 40 41 return ret ?: sectors; 42 } 43 44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 45 u32 snapshot) 46 { 47 u64 subdirs = 0; 48 49 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, 50 SPOS(inum, 0, snapshot), 51 POS(inum, U64_MAX), 52 0, k, ({ 53 if (k.k->type == KEY_TYPE_dirent && 54 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 55 subdirs++; 56 0; 57 })); 58 59 return ret ?: subdirs; 60 } 61 62 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 63 u32 *snapshot, u64 *inum) 64 { 65 struct bch_subvolume s; 66 int ret; 67 68 ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 69 70 *snapshot = le32_to_cpu(s.snapshot); 71 *inum = le64_to_cpu(s.inode); 72 return ret; 73 } 74 75 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 76 struct bch_inode_unpacked *inode) 77 { 78 struct btree_iter iter; 79 struct bkey_s_c k; 80 int ret; 81 82 bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, 83 POS(0, inode_nr), 84 BTREE_ITER_ALL_SNAPSHOTS); 85 k = bch2_btree_iter_peek(&iter); 86 ret = bkey_err(k); 87 if (ret) 88 goto err; 89 90 if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { 91 ret = -BCH_ERR_ENOENT_inode; 92 goto err; 93 } 94 95 ret = bch2_inode_unpack(k, inode); 96 err: 97 bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); 98 bch2_trans_iter_exit(trans, &iter); 99 return ret; 100 } 101 102 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, 103 struct bch_inode_unpacked *inode, 104 u32 *snapshot) 105 { 106 struct btree_iter iter; 107 struct bkey_s_c k; 108 int ret; 109 110 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 111 SPOS(0, inode_nr, *snapshot), 0); 112 ret = bkey_err(k); 113 if (ret) 114 goto err; 115 116 ret = bkey_is_inode(k.k) 117 ? bch2_inode_unpack(k, inode) 118 : -BCH_ERR_ENOENT_inode; 119 if (!ret) 120 *snapshot = iter.pos.snapshot; 121 err: 122 bch2_trans_iter_exit(trans, &iter); 123 return ret; 124 } 125 126 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 127 struct bch_hash_info hash_info, 128 subvol_inum dir, struct qstr *name, 129 u64 *target, unsigned *type, u32 snapshot) 130 { 131 struct btree_iter iter; 132 struct bkey_s_c_dirent d; 133 int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 134 &hash_info, dir, name, 0, snapshot); 135 if (ret) 136 return ret; 137 138 d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); 139 *target = le64_to_cpu(d.v->d_inum); 140 *type = d.v->d_type; 141 bch2_trans_iter_exit(trans, &iter); 142 return 0; 143 } 144 145 static int __write_inode(struct btree_trans *trans, 146 struct bch_inode_unpacked *inode, 147 u32 snapshot) 148 { 149 struct bkey_inode_buf *inode_p = 150 bch2_trans_kmalloc(trans, sizeof(*inode_p)); 151 152 if (IS_ERR(inode_p)) 153 return PTR_ERR(inode_p); 154 155 bch2_inode_pack(inode_p, inode); 156 inode_p->inode.k.p.snapshot = snapshot; 157 158 return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 159 &inode_p->inode.k_i, 160 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 161 } 162 163 static int fsck_write_inode(struct btree_trans *trans, 164 struct bch_inode_unpacked *inode, 165 u32 snapshot) 166 { 167 int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 168 __write_inode(trans, inode, snapshot)); 169 bch_err_fn(trans->c, ret); 170 return ret; 171 } 172 173 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 174 { 175 struct bch_fs *c = trans->c; 176 struct btree_iter iter; 177 struct bch_inode_unpacked dir_inode; 178 struct bch_hash_info dir_hash_info; 179 int ret; 180 181 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 182 if (ret) 183 goto err; 184 185 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 186 187 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); 188 189 ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 190 &dir_hash_info, &iter, 191 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 192 bch2_trans_iter_exit(trans, &iter); 193 err: 194 bch_err_fn(c, ret); 195 return ret; 196 } 197 198 /* Get lost+found, create if it doesn't exist: */ 199 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 200 struct bch_inode_unpacked *lostfound) 201 { 202 struct bch_fs *c = trans->c; 203 struct qstr lostfound_str = QSTR("lost+found"); 204 u64 inum = 0; 205 unsigned d_type = 0; 206 int ret; 207 208 struct bch_snapshot_tree st; 209 ret = bch2_snapshot_tree_lookup(trans, 210 bch2_snapshot_tree(c, snapshot), &st); 211 if (ret) 212 return ret; 213 214 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 215 u32 subvol_snapshot; 216 217 ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), 218 &subvol_snapshot, &root_inum.inum); 219 bch_err_msg(c, ret, "looking up root subvol"); 220 if (ret) 221 return ret; 222 223 struct bch_inode_unpacked root_inode; 224 struct bch_hash_info root_hash_info; 225 u32 root_inode_snapshot = snapshot; 226 ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); 227 bch_err_msg(c, ret, "looking up root inode"); 228 if (ret) 229 return ret; 230 231 root_hash_info = bch2_hash_info_init(c, &root_inode); 232 233 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 234 &lostfound_str, &inum, &d_type, snapshot); 235 if (bch2_err_matches(ret, ENOENT)) 236 goto create_lostfound; 237 238 bch_err_fn(c, ret); 239 if (ret) 240 return ret; 241 242 if (d_type != DT_DIR) { 243 bch_err(c, "error looking up lost+found: not a directory"); 244 return -BCH_ERR_ENOENT_not_directory; 245 } 246 247 /* 248 * The bch2_check_dirents pass has already run, dangling dirents 249 * shouldn't exist here: 250 */ 251 ret = lookup_inode(trans, inum, lostfound, &snapshot); 252 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 253 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 254 return ret; 255 256 create_lostfound: 257 /* 258 * XXX: we could have a nicer log message here if we had a nice way to 259 * walk backpointers to print a path 260 */ 261 bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); 262 263 u64 now = bch2_current_time(c); 264 struct btree_iter lostfound_iter = { NULL }; 265 u64 cpu = raw_smp_processor_id(); 266 267 bch2_inode_init_early(c, lostfound); 268 bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); 269 lostfound->bi_dir = root_inode.bi_inum; 270 271 root_inode.bi_nlink++; 272 273 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); 274 if (ret) 275 goto err; 276 277 bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); 278 ret = bch2_btree_iter_traverse(&lostfound_iter); 279 if (ret) 280 goto err; 281 282 ret = bch2_dirent_create_snapshot(trans, 283 root_inode.bi_inum, snapshot, &root_hash_info, 284 mode_to_type(lostfound->bi_mode), 285 &lostfound_str, 286 lostfound->bi_inum, 287 &lostfound->bi_dir_offset, 288 BCH_HASH_SET_MUST_CREATE) ?: 289 bch2_inode_write_flags(trans, &lostfound_iter, lostfound, 290 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 291 err: 292 bch_err_msg(c, ret, "creating lost+found"); 293 bch2_trans_iter_exit(trans, &lostfound_iter); 294 return ret; 295 } 296 297 static int reattach_inode(struct btree_trans *trans, 298 struct bch_inode_unpacked *inode, 299 u32 inode_snapshot) 300 { 301 struct bch_hash_info dir_hash; 302 struct bch_inode_unpacked lostfound; 303 char name_buf[20]; 304 struct qstr name; 305 u64 dir_offset = 0; 306 int ret; 307 308 ret = lookup_lostfound(trans, inode_snapshot, &lostfound); 309 if (ret) 310 return ret; 311 312 if (S_ISDIR(inode->bi_mode)) { 313 lostfound.bi_nlink++; 314 315 ret = __write_inode(trans, &lostfound, U32_MAX); 316 if (ret) 317 return ret; 318 } 319 320 dir_hash = bch2_hash_info_init(trans->c, &lostfound); 321 322 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 323 name = (struct qstr) QSTR(name_buf); 324 325 ret = bch2_dirent_create_snapshot(trans, 326 lostfound.bi_inum, inode_snapshot, 327 &dir_hash, 328 inode_d_type(inode), 329 &name, inode->bi_inum, &dir_offset, 330 BCH_HASH_SET_MUST_CREATE); 331 if (ret) 332 return ret; 333 334 inode->bi_dir = lostfound.bi_inum; 335 inode->bi_dir_offset = dir_offset; 336 337 return __write_inode(trans, inode, inode_snapshot); 338 } 339 340 static int remove_backpointer(struct btree_trans *trans, 341 struct bch_inode_unpacked *inode) 342 { 343 struct btree_iter iter; 344 struct bkey_s_c_dirent d; 345 int ret; 346 347 d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, 348 POS(inode->bi_dir, inode->bi_dir_offset), 0, 349 dirent); 350 ret = bkey_err(d) ?: 351 __remove_dirent(trans, d.k->p); 352 bch2_trans_iter_exit(trans, &iter); 353 return ret; 354 } 355 356 struct snapshots_seen_entry { 357 u32 id; 358 u32 equiv; 359 }; 360 361 struct snapshots_seen { 362 struct bpos pos; 363 DARRAY(struct snapshots_seen_entry) ids; 364 }; 365 366 static inline void snapshots_seen_exit(struct snapshots_seen *s) 367 { 368 darray_exit(&s->ids); 369 } 370 371 static inline void snapshots_seen_init(struct snapshots_seen *s) 372 { 373 memset(s, 0, sizeof(*s)); 374 } 375 376 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) 377 { 378 struct snapshots_seen_entry *i, n = { 379 .id = id, 380 .equiv = bch2_snapshot_equiv(c, id), 381 }; 382 int ret = 0; 383 384 __darray_for_each(s->ids, i) { 385 if (i->id == id) 386 return 0; 387 if (i->id > id) 388 break; 389 } 390 391 ret = darray_insert_item(&s->ids, i - s->ids.data, n); 392 if (ret) 393 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 394 s->ids.size); 395 return ret; 396 } 397 398 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, 399 enum btree_id btree_id, struct bpos pos) 400 { 401 struct snapshots_seen_entry n = { 402 .id = pos.snapshot, 403 .equiv = bch2_snapshot_equiv(c, pos.snapshot), 404 }; 405 int ret = 0; 406 407 if (!bkey_eq(s->pos, pos)) 408 s->ids.nr = 0; 409 410 s->pos = pos; 411 s->pos.snapshot = n.equiv; 412 413 darray_for_each(s->ids, i) { 414 if (i->id == n.id) 415 return 0; 416 417 /* 418 * We currently don't rigorously track for snapshot cleanup 419 * needing to be run, so it shouldn't be a fsck error yet: 420 */ 421 if (i->equiv == n.equiv) { 422 bch_err(c, "snapshot deletion did not finish:\n" 423 " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", 424 bch2_btree_id_str(btree_id), 425 pos.inode, pos.offset, 426 i->id, n.id, n.equiv); 427 set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); 428 return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); 429 } 430 } 431 432 ret = darray_push(&s->ids, n); 433 if (ret) 434 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 435 s->ids.size); 436 return ret; 437 } 438 439 /** 440 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, 441 * and @ancestor hasn't been overwritten in @seen 442 * 443 * @c: filesystem handle 444 * @seen: list of snapshot ids already seen at current position 445 * @id: descendent snapshot id 446 * @ancestor: ancestor snapshot id 447 * 448 * Returns: whether key in @ancestor snapshot is visible in @id snapshot 449 */ 450 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, 451 u32 id, u32 ancestor) 452 { 453 ssize_t i; 454 455 EBUG_ON(id > ancestor); 456 EBUG_ON(!bch2_snapshot_is_equiv(c, id)); 457 EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); 458 459 /* @ancestor should be the snapshot most recently added to @seen */ 460 EBUG_ON(ancestor != seen->pos.snapshot); 461 EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); 462 463 if (id == ancestor) 464 return true; 465 466 if (!bch2_snapshot_is_ancestor(c, id, ancestor)) 467 return false; 468 469 /* 470 * We know that @id is a descendant of @ancestor, we're checking if 471 * we've seen a key that overwrote @ancestor - i.e. also a descendent of 472 * @ascestor and with @id as a descendent. 473 * 474 * But we already know that we're scanning IDs between @id and @ancestor 475 * numerically, since snapshot ID lists are kept sorted, so if we find 476 * an id that's an ancestor of @id we're done: 477 */ 478 479 for (i = seen->ids.nr - 2; 480 i >= 0 && seen->ids.data[i].equiv >= id; 481 --i) 482 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) 483 return false; 484 485 return true; 486 } 487 488 /** 489 * ref_visible - given a key with snapshot id @src that points to a key with 490 * snapshot id @dst, test whether there is some snapshot in which @dst is 491 * visible. 492 * 493 * @c: filesystem handle 494 * @s: list of snapshot IDs already seen at @src 495 * @src: snapshot ID of src key 496 * @dst: snapshot ID of dst key 497 * Returns: true if there is some snapshot in which @dst is visible 498 * 499 * Assumes we're visiting @src keys in natural key order 500 */ 501 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, 502 u32 src, u32 dst) 503 { 504 return dst <= src 505 ? key_visible_in_snapshot(c, s, dst, src) 506 : bch2_snapshot_is_ancestor(c, src, dst); 507 } 508 509 static int ref_visible2(struct bch_fs *c, 510 u32 src, struct snapshots_seen *src_seen, 511 u32 dst, struct snapshots_seen *dst_seen) 512 { 513 src = bch2_snapshot_equiv(c, src); 514 dst = bch2_snapshot_equiv(c, dst); 515 516 if (dst > src) { 517 swap(dst, src); 518 swap(dst_seen, src_seen); 519 } 520 return key_visible_in_snapshot(c, src_seen, dst, src); 521 } 522 523 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 524 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ 525 (_i)->snapshot <= (_snapshot); _i++) \ 526 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) 527 528 struct inode_walker_entry { 529 struct bch_inode_unpacked inode; 530 u32 snapshot; 531 bool seen_this_pos; 532 u64 count; 533 }; 534 535 struct inode_walker { 536 bool first_this_inode; 537 bool recalculate_sums; 538 struct bpos last_pos; 539 540 DARRAY(struct inode_walker_entry) inodes; 541 }; 542 543 static void inode_walker_exit(struct inode_walker *w) 544 { 545 darray_exit(&w->inodes); 546 } 547 548 static struct inode_walker inode_walker_init(void) 549 { 550 return (struct inode_walker) { 0, }; 551 } 552 553 static int add_inode(struct bch_fs *c, struct inode_walker *w, 554 struct bkey_s_c inode) 555 { 556 struct bch_inode_unpacked u; 557 558 BUG_ON(bch2_inode_unpack(inode, &u)); 559 560 return darray_push(&w->inodes, ((struct inode_walker_entry) { 561 .inode = u, 562 .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), 563 })); 564 } 565 566 static int get_inodes_all_snapshots(struct btree_trans *trans, 567 struct inode_walker *w, u64 inum) 568 { 569 struct bch_fs *c = trans->c; 570 struct btree_iter iter; 571 struct bkey_s_c k; 572 int ret; 573 574 w->recalculate_sums = false; 575 w->inodes.nr = 0; 576 577 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 578 BTREE_ITER_ALL_SNAPSHOTS, k, ret) { 579 if (k.k->p.offset != inum) 580 break; 581 582 if (bkey_is_inode(k.k)) 583 add_inode(c, w, k); 584 } 585 bch2_trans_iter_exit(trans, &iter); 586 587 if (ret) 588 return ret; 589 590 w->first_this_inode = true; 591 return 0; 592 } 593 594 static struct inode_walker_entry * 595 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, 596 u32 snapshot, bool is_whiteout) 597 { 598 struct inode_walker_entry *i; 599 600 snapshot = bch2_snapshot_equiv(c, snapshot); 601 602 __darray_for_each(w->inodes, i) 603 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) 604 goto found; 605 606 return NULL; 607 found: 608 BUG_ON(snapshot > i->snapshot); 609 610 if (snapshot != i->snapshot && !is_whiteout) { 611 struct inode_walker_entry new = *i; 612 size_t pos; 613 int ret; 614 615 new.snapshot = snapshot; 616 new.count = 0; 617 618 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", 619 w->last_pos.inode, snapshot, i->snapshot); 620 621 while (i > w->inodes.data && i[-1].snapshot > snapshot) 622 --i; 623 624 pos = i - w->inodes.data; 625 ret = darray_insert_item(&w->inodes, pos, new); 626 if (ret) 627 return ERR_PTR(ret); 628 629 i = w->inodes.data + pos; 630 } 631 632 return i; 633 } 634 635 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 636 struct inode_walker *w, struct bpos pos, 637 bool is_whiteout) 638 { 639 if (w->last_pos.inode != pos.inode) { 640 int ret = get_inodes_all_snapshots(trans, w, pos.inode); 641 if (ret) 642 return ERR_PTR(ret); 643 } else if (bkey_cmp(w->last_pos, pos)) { 644 darray_for_each(w->inodes, i) 645 i->seen_this_pos = false; 646 } 647 648 w->last_pos = pos; 649 650 return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); 651 } 652 653 static int __get_visible_inodes(struct btree_trans *trans, 654 struct inode_walker *w, 655 struct snapshots_seen *s, 656 u64 inum) 657 { 658 struct bch_fs *c = trans->c; 659 struct btree_iter iter; 660 struct bkey_s_c k; 661 int ret; 662 663 w->inodes.nr = 0; 664 665 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 666 BTREE_ITER_ALL_SNAPSHOTS, k, ret) { 667 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); 668 669 if (k.k->p.offset != inum) 670 break; 671 672 if (!ref_visible(c, s, s->pos.snapshot, equiv)) 673 continue; 674 675 if (bkey_is_inode(k.k)) 676 add_inode(c, w, k); 677 678 if (equiv >= s->pos.snapshot) 679 break; 680 } 681 bch2_trans_iter_exit(trans, &iter); 682 683 return ret; 684 } 685 686 static int check_key_has_snapshot(struct btree_trans *trans, 687 struct btree_iter *iter, 688 struct bkey_s_c k) 689 { 690 struct bch_fs *c = trans->c; 691 struct printbuf buf = PRINTBUF; 692 int ret = 0; 693 694 if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, 695 bkey_in_missing_snapshot, 696 "key in missing snapshot: %s", 697 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 698 ret = bch2_btree_delete_at(trans, iter, 699 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; 700 fsck_err: 701 printbuf_exit(&buf); 702 return ret; 703 } 704 705 static int hash_redo_key(struct btree_trans *trans, 706 const struct bch_hash_desc desc, 707 struct bch_hash_info *hash_info, 708 struct btree_iter *k_iter, struct bkey_s_c k) 709 { 710 struct bkey_i *delete; 711 struct bkey_i *tmp; 712 713 delete = bch2_trans_kmalloc(trans, sizeof(*delete)); 714 if (IS_ERR(delete)) 715 return PTR_ERR(delete); 716 717 tmp = bch2_bkey_make_mut_noupdate(trans, k); 718 if (IS_ERR(tmp)) 719 return PTR_ERR(tmp); 720 721 bkey_init(&delete->k); 722 delete->k.p = k_iter->pos; 723 return bch2_btree_iter_traverse(k_iter) ?: 724 bch2_trans_update(trans, k_iter, delete, 0) ?: 725 bch2_hash_set_snapshot(trans, desc, hash_info, 726 (subvol_inum) { 0, k.k->p.inode }, 727 k.k->p.snapshot, tmp, 728 BCH_HASH_SET_MUST_CREATE, 729 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 730 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 731 } 732 733 static int hash_check_key(struct btree_trans *trans, 734 const struct bch_hash_desc desc, 735 struct bch_hash_info *hash_info, 736 struct btree_iter *k_iter, struct bkey_s_c hash_k) 737 { 738 struct bch_fs *c = trans->c; 739 struct btree_iter iter = { NULL }; 740 struct printbuf buf = PRINTBUF; 741 struct bkey_s_c k; 742 u64 hash; 743 int ret = 0; 744 745 if (hash_k.k->type != desc.key_type) 746 return 0; 747 748 hash = desc.hash_bkey(hash_info, hash_k); 749 750 if (likely(hash == hash_k.k->p.offset)) 751 return 0; 752 753 if (hash_k.k->p.offset < hash) 754 goto bad_hash; 755 756 for_each_btree_key_norestart(trans, iter, desc.btree_id, 757 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 758 BTREE_ITER_SLOTS, k, ret) { 759 if (bkey_eq(k.k->p, hash_k.k->p)) 760 break; 761 762 if (fsck_err_on(k.k->type == desc.key_type && 763 !desc.cmp_bkey(k, hash_k), c, 764 hash_table_key_duplicate, 765 "duplicate hash table keys:\n%s", 766 (printbuf_reset(&buf), 767 bch2_bkey_val_to_text(&buf, c, hash_k), 768 buf.buf))) { 769 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; 770 break; 771 } 772 773 if (bkey_deleted(k.k)) { 774 bch2_trans_iter_exit(trans, &iter); 775 goto bad_hash; 776 } 777 } 778 out: 779 bch2_trans_iter_exit(trans, &iter); 780 printbuf_exit(&buf); 781 return ret; 782 bad_hash: 783 if (fsck_err(c, hash_table_key_wrong_offset, 784 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", 785 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, 786 (printbuf_reset(&buf), 787 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 788 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); 789 bch_err_fn(c, ret); 790 if (ret) 791 return ret; 792 ret = -BCH_ERR_transaction_restart_nested; 793 } 794 fsck_err: 795 goto out; 796 } 797 798 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 799 { 800 struct btree_iter iter; 801 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 802 int ret = bkey_err(k); 803 if (ret) 804 return ret; 805 806 bch2_trans_iter_exit(trans, &iter); 807 return k.k->type == KEY_TYPE_set; 808 } 809 810 static int check_inode(struct btree_trans *trans, 811 struct btree_iter *iter, 812 struct bkey_s_c k, 813 struct bch_inode_unpacked *prev, 814 struct snapshots_seen *s, 815 bool full) 816 { 817 struct bch_fs *c = trans->c; 818 struct bch_inode_unpacked u; 819 bool do_update = false; 820 int ret; 821 822 ret = check_key_has_snapshot(trans, iter, k); 823 if (ret < 0) 824 goto err; 825 if (ret) 826 return 0; 827 828 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 829 if (ret) 830 goto err; 831 832 if (!bkey_is_inode(k.k)) 833 return 0; 834 835 BUG_ON(bch2_inode_unpack(k, &u)); 836 837 if (!full && 838 !(u.bi_flags & (BCH_INODE_i_size_dirty| 839 BCH_INODE_i_sectors_dirty| 840 BCH_INODE_unlinked))) 841 return 0; 842 843 if (prev->bi_inum != u.bi_inum) 844 *prev = u; 845 846 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || 847 inode_d_type(prev) != inode_d_type(&u), 848 c, inode_snapshot_mismatch, 849 "inodes in different snapshots don't match")) { 850 bch_err(c, "repair not implemented yet"); 851 return -BCH_ERR_fsck_repair_unimplemented; 852 } 853 854 if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && 855 bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { 856 struct bpos new_min_pos; 857 858 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); 859 if (ret) 860 goto err; 861 862 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; 863 864 ret = __write_inode(trans, &u, iter->pos.snapshot); 865 bch_err_msg(c, ret, "in fsck updating inode"); 866 if (ret) 867 return ret; 868 869 if (!bpos_eq(new_min_pos, POS_MIN)) 870 bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); 871 return 0; 872 } 873 874 if (u.bi_flags & BCH_INODE_unlinked) { 875 ret = check_inode_deleted_list(trans, k.k->p); 876 if (ret < 0) 877 return ret; 878 879 fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, 880 "inode %llu:%u unlinked, but not on deleted list", 881 u.bi_inum, k.k->p.snapshot); 882 ret = 0; 883 } 884 885 if (u.bi_flags & BCH_INODE_unlinked && 886 (!c->sb.clean || 887 fsck_err(c, inode_unlinked_but_clean, 888 "filesystem marked clean, but inode %llu unlinked", 889 u.bi_inum))) { 890 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 891 bch_err_msg(c, ret, "in fsck deleting inode"); 892 return ret; 893 } 894 895 if (u.bi_flags & BCH_INODE_i_size_dirty && 896 (!c->sb.clean || 897 fsck_err(c, inode_i_size_dirty_but_clean, 898 "filesystem marked clean, but inode %llu has i_size dirty", 899 u.bi_inum))) { 900 bch_verbose(c, "truncating inode %llu", u.bi_inum); 901 902 /* 903 * XXX: need to truncate partial blocks too here - or ideally 904 * just switch units to bytes and that issue goes away 905 */ 906 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, 907 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, 908 iter->pos.snapshot), 909 POS(u.bi_inum, U64_MAX), 910 0, NULL); 911 bch_err_msg(c, ret, "in fsck truncating inode"); 912 if (ret) 913 return ret; 914 915 /* 916 * We truncated without our normal sector accounting hook, just 917 * make sure we recalculate it: 918 */ 919 u.bi_flags |= BCH_INODE_i_sectors_dirty; 920 921 u.bi_flags &= ~BCH_INODE_i_size_dirty; 922 do_update = true; 923 } 924 925 if (u.bi_flags & BCH_INODE_i_sectors_dirty && 926 (!c->sb.clean || 927 fsck_err(c, inode_i_sectors_dirty_but_clean, 928 "filesystem marked clean, but inode %llu has i_sectors dirty", 929 u.bi_inum))) { 930 s64 sectors; 931 932 bch_verbose(c, "recounting sectors for inode %llu", 933 u.bi_inum); 934 935 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); 936 if (sectors < 0) { 937 bch_err_msg(c, sectors, "in fsck recounting inode sectors"); 938 return sectors; 939 } 940 941 u.bi_sectors = sectors; 942 u.bi_flags &= ~BCH_INODE_i_sectors_dirty; 943 do_update = true; 944 } 945 946 if (u.bi_flags & BCH_INODE_backptr_untrusted) { 947 u.bi_dir = 0; 948 u.bi_dir_offset = 0; 949 u.bi_flags &= ~BCH_INODE_backptr_untrusted; 950 do_update = true; 951 } 952 953 if (do_update) { 954 ret = __write_inode(trans, &u, iter->pos.snapshot); 955 bch_err_msg(c, ret, "in fsck updating inode"); 956 if (ret) 957 return ret; 958 } 959 err: 960 fsck_err: 961 bch_err_fn(c, ret); 962 return ret; 963 } 964 965 int bch2_check_inodes(struct bch_fs *c) 966 { 967 bool full = c->opts.fsck; 968 struct bch_inode_unpacked prev = { 0 }; 969 struct snapshots_seen s; 970 971 snapshots_seen_init(&s); 972 973 int ret = bch2_trans_run(c, 974 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 975 POS_MIN, 976 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, 977 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 978 check_inode(trans, &iter, k, &prev, &s, full))); 979 980 snapshots_seen_exit(&s); 981 bch_err_fn(c, ret); 982 return ret; 983 } 984 985 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 986 struct btree_iter *iter, 987 struct bpos pos) 988 { 989 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 990 } 991 992 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 993 struct bkey_s_c_dirent d) 994 { 995 return inode->bi_dir == d.k->p.inode && 996 inode->bi_dir_offset == d.k->p.offset; 997 } 998 999 static bool dirent_points_to_inode(struct bkey_s_c_dirent d, 1000 struct bch_inode_unpacked *inode) 1001 { 1002 return d.v->d_type == DT_SUBVOL 1003 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 1004 : le64_to_cpu(d.v->d_inum) == inode->bi_inum; 1005 } 1006 1007 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) 1008 { 1009 struct bch_fs *c = trans->c; 1010 u32 restart_count = trans->restart_count; 1011 int ret = 0; 1012 s64 count2; 1013 1014 darray_for_each(w->inodes, i) { 1015 if (i->inode.bi_sectors == i->count) 1016 continue; 1017 1018 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); 1019 1020 if (w->recalculate_sums) 1021 i->count = count2; 1022 1023 if (i->count != count2) { 1024 bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", 1025 w->last_pos.inode, i->snapshot, i->count, count2); 1026 return -BCH_ERR_internal_fsck_err; 1027 } 1028 1029 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), 1030 c, inode_i_sectors_wrong, 1031 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", 1032 w->last_pos.inode, i->snapshot, 1033 i->inode.bi_sectors, i->count)) { 1034 i->inode.bi_sectors = i->count; 1035 ret = fsck_write_inode(trans, &i->inode, i->snapshot); 1036 if (ret) 1037 break; 1038 } 1039 } 1040 fsck_err: 1041 bch_err_fn(c, ret); 1042 return ret ?: trans_was_restarted(trans, restart_count); 1043 } 1044 1045 struct extent_end { 1046 u32 snapshot; 1047 u64 offset; 1048 struct snapshots_seen seen; 1049 }; 1050 1051 struct extent_ends { 1052 struct bpos last_pos; 1053 DARRAY(struct extent_end) e; 1054 }; 1055 1056 static void extent_ends_reset(struct extent_ends *extent_ends) 1057 { 1058 darray_for_each(extent_ends->e, i) 1059 snapshots_seen_exit(&i->seen); 1060 extent_ends->e.nr = 0; 1061 } 1062 1063 static void extent_ends_exit(struct extent_ends *extent_ends) 1064 { 1065 extent_ends_reset(extent_ends); 1066 darray_exit(&extent_ends->e); 1067 } 1068 1069 static void extent_ends_init(struct extent_ends *extent_ends) 1070 { 1071 memset(extent_ends, 0, sizeof(*extent_ends)); 1072 } 1073 1074 static int extent_ends_at(struct bch_fs *c, 1075 struct extent_ends *extent_ends, 1076 struct snapshots_seen *seen, 1077 struct bkey_s_c k) 1078 { 1079 struct extent_end *i, n = (struct extent_end) { 1080 .offset = k.k->p.offset, 1081 .snapshot = k.k->p.snapshot, 1082 .seen = *seen, 1083 }; 1084 1085 n.seen.ids.data = kmemdup(seen->ids.data, 1086 sizeof(seen->ids.data[0]) * seen->ids.size, 1087 GFP_KERNEL); 1088 if (!n.seen.ids.data) 1089 return -BCH_ERR_ENOMEM_fsck_extent_ends_at; 1090 1091 __darray_for_each(extent_ends->e, i) { 1092 if (i->snapshot == k.k->p.snapshot) { 1093 snapshots_seen_exit(&i->seen); 1094 *i = n; 1095 return 0; 1096 } 1097 1098 if (i->snapshot >= k.k->p.snapshot) 1099 break; 1100 } 1101 1102 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); 1103 } 1104 1105 static int overlapping_extents_found(struct btree_trans *trans, 1106 enum btree_id btree, 1107 struct bpos pos1, struct snapshots_seen *pos1_seen, 1108 struct bkey pos2, 1109 bool *fixed, 1110 struct extent_end *extent_end) 1111 { 1112 struct bch_fs *c = trans->c; 1113 struct printbuf buf = PRINTBUF; 1114 struct btree_iter iter1, iter2 = { NULL }; 1115 struct bkey_s_c k1, k2; 1116 int ret; 1117 1118 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); 1119 1120 bch2_trans_iter_init(trans, &iter1, btree, pos1, 1121 BTREE_ITER_ALL_SNAPSHOTS| 1122 BTREE_ITER_NOT_EXTENTS); 1123 k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); 1124 ret = bkey_err(k1); 1125 if (ret) 1126 goto err; 1127 1128 prt_str(&buf, "\n "); 1129 bch2_bkey_val_to_text(&buf, c, k1); 1130 1131 if (!bpos_eq(pos1, k1.k->p)) { 1132 prt_str(&buf, "\n wanted\n "); 1133 bch2_bpos_to_text(&buf, pos1); 1134 prt_str(&buf, "\n "); 1135 bch2_bkey_to_text(&buf, &pos2); 1136 1137 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1138 __func__, buf.buf); 1139 ret = -BCH_ERR_internal_fsck_err; 1140 goto err; 1141 } 1142 1143 bch2_trans_copy_iter(&iter2, &iter1); 1144 1145 while (1) { 1146 bch2_btree_iter_advance(&iter2); 1147 1148 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); 1149 ret = bkey_err(k2); 1150 if (ret) 1151 goto err; 1152 1153 if (bpos_ge(k2.k->p, pos2.p)) 1154 break; 1155 } 1156 1157 prt_str(&buf, "\n "); 1158 bch2_bkey_val_to_text(&buf, c, k2); 1159 1160 if (bpos_gt(k2.k->p, pos2.p) || 1161 pos2.size != k2.k->size) { 1162 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1163 __func__, buf.buf); 1164 ret = -BCH_ERR_internal_fsck_err; 1165 goto err; 1166 } 1167 1168 prt_printf(&buf, "\n overwriting %s extent", 1169 pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); 1170 1171 if (fsck_err(c, extent_overlapping, 1172 "overlapping extents%s", buf.buf)) { 1173 struct btree_iter *old_iter = &iter1; 1174 struct disk_reservation res = { 0 }; 1175 1176 if (pos1.snapshot < pos2.p.snapshot) { 1177 old_iter = &iter2; 1178 swap(k1, k2); 1179 } 1180 1181 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1182 1183 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1184 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, 1185 k1, k2) ?: 1186 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1187 bch2_disk_reservation_put(c, &res); 1188 1189 if (ret) 1190 goto err; 1191 1192 *fixed = true; 1193 1194 if (pos1.snapshot == pos2.p.snapshot) { 1195 /* 1196 * We overwrote the first extent, and did the overwrite 1197 * in the same snapshot: 1198 */ 1199 extent_end->offset = bkey_start_offset(&pos2); 1200 } else if (pos1.snapshot > pos2.p.snapshot) { 1201 /* 1202 * We overwrote the first extent in pos2's snapshot: 1203 */ 1204 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1205 } else { 1206 /* 1207 * We overwrote the second extent - restart 1208 * check_extent() from the top: 1209 */ 1210 ret = -BCH_ERR_transaction_restart_nested; 1211 } 1212 } 1213 fsck_err: 1214 err: 1215 bch2_trans_iter_exit(trans, &iter2); 1216 bch2_trans_iter_exit(trans, &iter1); 1217 printbuf_exit(&buf); 1218 return ret; 1219 } 1220 1221 static int check_overlapping_extents(struct btree_trans *trans, 1222 struct snapshots_seen *seen, 1223 struct extent_ends *extent_ends, 1224 struct bkey_s_c k, 1225 u32 equiv, 1226 struct btree_iter *iter, 1227 bool *fixed) 1228 { 1229 struct bch_fs *c = trans->c; 1230 int ret = 0; 1231 1232 /* transaction restart, running again */ 1233 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1234 return 0; 1235 1236 if (extent_ends->last_pos.inode != k.k->p.inode) 1237 extent_ends_reset(extent_ends); 1238 1239 darray_for_each(extent_ends->e, i) { 1240 if (i->offset <= bkey_start_offset(k.k)) 1241 continue; 1242 1243 if (!ref_visible2(c, 1244 k.k->p.snapshot, seen, 1245 i->snapshot, &i->seen)) 1246 continue; 1247 1248 ret = overlapping_extents_found(trans, iter->btree_id, 1249 SPOS(iter->pos.inode, 1250 i->offset, 1251 i->snapshot), 1252 &i->seen, 1253 *k.k, fixed, i); 1254 if (ret) 1255 goto err; 1256 } 1257 1258 ret = extent_ends_at(c, extent_ends, seen, k); 1259 if (ret) 1260 goto err; 1261 1262 extent_ends->last_pos = k.k->p; 1263 err: 1264 return ret; 1265 } 1266 1267 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1268 struct bkey_s_c k) 1269 { 1270 struct bch_fs *c = trans->c; 1271 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1272 struct bch_extent_crc_unpacked crc; 1273 const union bch_extent_entry *i; 1274 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1275 1276 bkey_for_each_crc(k.k, ptrs, crc, i) 1277 if (crc_is_encoded(crc) && 1278 crc.uncompressed_size > encoded_extent_max_sectors) { 1279 struct printbuf buf = PRINTBUF; 1280 1281 bch2_bkey_val_to_text(&buf, c, k); 1282 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1283 printbuf_exit(&buf); 1284 } 1285 1286 return 0; 1287 } 1288 1289 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1290 struct bkey_s_c k, 1291 struct inode_walker *inode, 1292 struct snapshots_seen *s, 1293 struct extent_ends *extent_ends) 1294 { 1295 struct bch_fs *c = trans->c; 1296 struct inode_walker_entry *i; 1297 struct printbuf buf = PRINTBUF; 1298 struct bpos equiv = k.k->p; 1299 int ret = 0; 1300 1301 equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); 1302 1303 ret = check_key_has_snapshot(trans, iter, k); 1304 if (ret) { 1305 ret = ret < 0 ? ret : 0; 1306 goto out; 1307 } 1308 1309 if (inode->last_pos.inode != k.k->p.inode) { 1310 ret = check_i_sectors(trans, inode); 1311 if (ret) 1312 goto err; 1313 } 1314 1315 i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); 1316 ret = PTR_ERR_OR_ZERO(i); 1317 if (ret) 1318 goto err; 1319 1320 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1321 if (ret) 1322 goto err; 1323 1324 if (k.k->type != KEY_TYPE_whiteout) { 1325 if (fsck_err_on(!i, c, extent_in_missing_inode, 1326 "extent in missing inode:\n %s", 1327 (printbuf_reset(&buf), 1328 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1329 goto delete; 1330 1331 if (fsck_err_on(i && 1332 !S_ISREG(i->inode.bi_mode) && 1333 !S_ISLNK(i->inode.bi_mode), 1334 c, extent_in_non_reg_inode, 1335 "extent in non regular inode mode %o:\n %s", 1336 i->inode.bi_mode, 1337 (printbuf_reset(&buf), 1338 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1339 goto delete; 1340 1341 ret = check_overlapping_extents(trans, s, extent_ends, k, 1342 equiv.snapshot, iter, 1343 &inode->recalculate_sums); 1344 if (ret) 1345 goto err; 1346 } 1347 1348 /* 1349 * Check inodes in reverse order, from oldest snapshots to newest, 1350 * starting from the inode that matches this extent's snapshot. If we 1351 * didn't have one, iterate over all inodes: 1352 */ 1353 if (!i) 1354 i = inode->inodes.data + inode->inodes.nr - 1; 1355 1356 for (; 1357 inode->inodes.data && i >= inode->inodes.data; 1358 --i) { 1359 if (i->snapshot > equiv.snapshot || 1360 !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) 1361 continue; 1362 1363 if (k.k->type != KEY_TYPE_whiteout) { 1364 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && 1365 k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1366 !bkey_extent_is_reservation(k), 1367 c, extent_past_end_of_inode, 1368 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1369 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1370 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1371 struct btree_iter iter2; 1372 1373 bch2_trans_copy_iter(&iter2, iter); 1374 bch2_btree_iter_set_snapshot(&iter2, i->snapshot); 1375 ret = bch2_btree_iter_traverse(&iter2) ?: 1376 bch2_btree_delete_at(trans, &iter2, 1377 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 1378 bch2_trans_iter_exit(trans, &iter2); 1379 if (ret) 1380 goto err; 1381 1382 iter->k.type = KEY_TYPE_whiteout; 1383 } 1384 1385 if (bkey_extent_is_allocation(k.k)) 1386 i->count += k.k->size; 1387 } 1388 1389 i->seen_this_pos = true; 1390 } 1391 out: 1392 err: 1393 fsck_err: 1394 printbuf_exit(&buf); 1395 bch_err_fn(c, ret); 1396 return ret; 1397 delete: 1398 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 1399 goto out; 1400 } 1401 1402 /* 1403 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1404 * that i_size an i_sectors are consistent 1405 */ 1406 int bch2_check_extents(struct bch_fs *c) 1407 { 1408 struct inode_walker w = inode_walker_init(); 1409 struct snapshots_seen s; 1410 struct extent_ends extent_ends; 1411 struct disk_reservation res = { 0 }; 1412 1413 snapshots_seen_init(&s); 1414 extent_ends_init(&extent_ends); 1415 1416 int ret = bch2_trans_run(c, 1417 for_each_btree_key_commit(trans, iter, BTREE_ID_extents, 1418 POS(BCACHEFS_ROOT_INO, 0), 1419 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, 1420 &res, NULL, 1421 BCH_TRANS_COMMIT_no_enospc, ({ 1422 bch2_disk_reservation_put(c, &res); 1423 check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: 1424 check_extent_overbig(trans, &iter, k); 1425 })) ?: 1426 check_i_sectors(trans, &w)); 1427 1428 bch2_disk_reservation_put(c, &res); 1429 extent_ends_exit(&extent_ends); 1430 inode_walker_exit(&w); 1431 snapshots_seen_exit(&s); 1432 1433 bch_err_fn(c, ret); 1434 return ret; 1435 } 1436 1437 int bch2_check_indirect_extents(struct bch_fs *c) 1438 { 1439 struct disk_reservation res = { 0 }; 1440 1441 int ret = bch2_trans_run(c, 1442 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 1443 POS_MIN, 1444 BTREE_ITER_PREFETCH, k, 1445 &res, NULL, 1446 BCH_TRANS_COMMIT_no_enospc, ({ 1447 bch2_disk_reservation_put(c, &res); 1448 check_extent_overbig(trans, &iter, k); 1449 }))); 1450 1451 bch2_disk_reservation_put(c, &res); 1452 bch_err_fn(c, ret); 1453 return ret; 1454 } 1455 1456 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 1457 { 1458 struct bch_fs *c = trans->c; 1459 u32 restart_count = trans->restart_count; 1460 int ret = 0; 1461 s64 count2; 1462 1463 darray_for_each(w->inodes, i) { 1464 if (i->inode.bi_nlink == i->count) 1465 continue; 1466 1467 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 1468 if (count2 < 0) 1469 return count2; 1470 1471 if (i->count != count2) { 1472 bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", 1473 i->count, count2); 1474 i->count = count2; 1475 if (i->inode.bi_nlink == i->count) 1476 continue; 1477 } 1478 1479 if (fsck_err_on(i->inode.bi_nlink != i->count, 1480 c, inode_dir_wrong_nlink, 1481 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 1482 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 1483 i->inode.bi_nlink = i->count; 1484 ret = fsck_write_inode(trans, &i->inode, i->snapshot); 1485 if (ret) 1486 break; 1487 } 1488 } 1489 fsck_err: 1490 bch_err_fn(c, ret); 1491 return ret ?: trans_was_restarted(trans, restart_count); 1492 } 1493 1494 static int check_dirent_target(struct btree_trans *trans, 1495 struct btree_iter *iter, 1496 struct bkey_s_c_dirent d, 1497 struct bch_inode_unpacked *target, 1498 u32 target_snapshot) 1499 { 1500 struct bch_fs *c = trans->c; 1501 struct bkey_i_dirent *n; 1502 struct printbuf buf = PRINTBUF; 1503 struct btree_iter bp_iter = { NULL }; 1504 int ret = 0; 1505 1506 if (!target->bi_dir && 1507 !target->bi_dir_offset) { 1508 target->bi_dir = d.k->p.inode; 1509 target->bi_dir_offset = d.k->p.offset; 1510 1511 ret = __write_inode(trans, target, target_snapshot); 1512 if (ret) 1513 goto err; 1514 } 1515 1516 if (!inode_points_to_dirent(target, d)) { 1517 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1518 SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); 1519 ret = bkey_err(bp_dirent); 1520 if (ret && !bch2_err_matches(ret, ENOENT)) 1521 goto err; 1522 1523 bool backpointer_exists = !ret; 1524 ret = 0; 1525 1526 bch2_bkey_val_to_text(&buf, c, d.s_c); 1527 prt_newline(&buf); 1528 if (backpointer_exists) 1529 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 1530 1531 if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, 1532 c, inode_dir_multiple_links, 1533 "directory %llu:%u with multiple links\n%s", 1534 target->bi_inum, target_snapshot, buf.buf)) { 1535 ret = __remove_dirent(trans, d.k->p); 1536 goto out; 1537 } 1538 1539 /* 1540 * hardlinked file with nlink 0: 1541 * We're just adjusting nlink here so check_nlinks() will pick 1542 * it up, it ignores inodes with nlink 0 1543 */ 1544 if (fsck_err_on(backpointer_exists && !target->bi_nlink, 1545 c, inode_multiple_links_but_nlink_0, 1546 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 1547 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 1548 target->bi_nlink++; 1549 target->bi_flags &= ~BCH_INODE_unlinked; 1550 1551 ret = __write_inode(trans, target, target_snapshot); 1552 if (ret) 1553 goto err; 1554 } 1555 1556 if (fsck_err_on(!backpointer_exists, 1557 c, inode_wrong_backpointer, 1558 "inode %llu:%u has wrong backpointer:\n" 1559 "got %llu:%llu\n" 1560 "should be %llu:%llu", 1561 target->bi_inum, target_snapshot, 1562 target->bi_dir, 1563 target->bi_dir_offset, 1564 d.k->p.inode, 1565 d.k->p.offset)) { 1566 target->bi_dir = d.k->p.inode; 1567 target->bi_dir_offset = d.k->p.offset; 1568 1569 ret = __write_inode(trans, target, target_snapshot); 1570 if (ret) 1571 goto err; 1572 } 1573 } 1574 1575 if (fsck_err_on(d.v->d_type != inode_d_type(target), 1576 c, dirent_d_type_wrong, 1577 "incorrect d_type: got %s, should be %s:\n%s", 1578 bch2_d_type_str(d.v->d_type), 1579 bch2_d_type_str(inode_d_type(target)), 1580 (printbuf_reset(&buf), 1581 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1582 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 1583 ret = PTR_ERR_OR_ZERO(n); 1584 if (ret) 1585 goto err; 1586 1587 bkey_reassemble(&n->k_i, d.s_c); 1588 n->v.d_type = inode_d_type(target); 1589 1590 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1591 if (ret) 1592 goto err; 1593 1594 d = dirent_i_to_s_c(n); 1595 } 1596 1597 if (fsck_err_on(d.v->d_type == DT_SUBVOL && 1598 target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), 1599 c, dirent_d_parent_subvol_wrong, 1600 "dirent has wrong d_parent_subvol field: got %u, should be %u", 1601 le32_to_cpu(d.v->d_parent_subvol), 1602 target->bi_parent_subvol)) { 1603 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 1604 ret = PTR_ERR_OR_ZERO(n); 1605 if (ret) 1606 goto err; 1607 1608 bkey_reassemble(&n->k_i, d.s_c); 1609 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 1610 1611 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1612 if (ret) 1613 goto err; 1614 1615 d = dirent_i_to_s_c(n); 1616 } 1617 out: 1618 err: 1619 fsck_err: 1620 bch2_trans_iter_exit(trans, &bp_iter); 1621 printbuf_exit(&buf); 1622 bch_err_fn(c, ret); 1623 return ret; 1624 } 1625 1626 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 1627 struct bkey_s_c k, 1628 struct bch_hash_info *hash_info, 1629 struct inode_walker *dir, 1630 struct inode_walker *target, 1631 struct snapshots_seen *s) 1632 { 1633 struct bch_fs *c = trans->c; 1634 struct bkey_s_c_dirent d; 1635 struct inode_walker_entry *i; 1636 struct printbuf buf = PRINTBUF; 1637 struct bpos equiv; 1638 int ret = 0; 1639 1640 ret = check_key_has_snapshot(trans, iter, k); 1641 if (ret) { 1642 ret = ret < 0 ? ret : 0; 1643 goto out; 1644 } 1645 1646 equiv = k.k->p; 1647 equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); 1648 1649 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1650 if (ret) 1651 goto err; 1652 1653 if (k.k->type == KEY_TYPE_whiteout) 1654 goto out; 1655 1656 if (dir->last_pos.inode != k.k->p.inode) { 1657 ret = check_subdir_count(trans, dir); 1658 if (ret) 1659 goto err; 1660 } 1661 1662 BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); 1663 1664 i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); 1665 ret = PTR_ERR_OR_ZERO(i); 1666 if (ret < 0) 1667 goto err; 1668 1669 if (dir->first_this_inode && dir->inodes.nr) 1670 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); 1671 dir->first_this_inode = false; 1672 1673 if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, 1674 "dirent in nonexisting directory:\n%s", 1675 (printbuf_reset(&buf), 1676 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1677 ret = bch2_btree_delete_at(trans, iter, 1678 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); 1679 goto out; 1680 } 1681 1682 if (!i) 1683 goto out; 1684 1685 if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), 1686 c, dirent_in_non_dir_inode, 1687 "dirent in non directory inode type %s:\n%s", 1688 bch2_d_type_str(inode_d_type(&i->inode)), 1689 (printbuf_reset(&buf), 1690 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1691 ret = bch2_btree_delete_at(trans, iter, 0); 1692 goto out; 1693 } 1694 1695 ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); 1696 if (ret < 0) 1697 goto err; 1698 if (ret) { 1699 /* dirent has been deleted */ 1700 ret = 0; 1701 goto out; 1702 } 1703 1704 if (k.k->type != KEY_TYPE_dirent) 1705 goto out; 1706 1707 d = bkey_s_c_to_dirent(k); 1708 1709 if (d.v->d_type == DT_SUBVOL) { 1710 struct bch_inode_unpacked subvol_root; 1711 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1712 u32 target_snapshot; 1713 u64 target_inum; 1714 1715 ret = subvol_lookup(trans, target_subvol, 1716 &target_snapshot, &target_inum); 1717 if (ret && !bch2_err_matches(ret, ENOENT)) 1718 goto err; 1719 1720 if (fsck_err_on(ret, c, dirent_to_missing_subvol, 1721 "dirent points to missing subvolume %u", 1722 le32_to_cpu(d.v->d_child_subvol))) { 1723 ret = __remove_dirent(trans, d.k->p); 1724 goto err; 1725 } 1726 1727 ret = lookup_inode(trans, target_inum, 1728 &subvol_root, &target_snapshot); 1729 if (ret && !bch2_err_matches(ret, ENOENT)) 1730 goto err; 1731 1732 if (fsck_err_on(ret, c, subvol_to_missing_root, 1733 "subvolume %u points to missing subvolume root %llu", 1734 target_subvol, 1735 target_inum)) { 1736 bch_err(c, "repair not implemented yet"); 1737 ret = -EINVAL; 1738 goto err; 1739 } 1740 1741 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, 1742 c, subvol_root_wrong_bi_subvol, 1743 "subvol root %llu has wrong bi_subvol field: got %u, should be %u", 1744 target_inum, 1745 subvol_root.bi_subvol, target_subvol)) { 1746 subvol_root.bi_subvol = target_subvol; 1747 ret = __write_inode(trans, &subvol_root, target_snapshot); 1748 if (ret) 1749 goto err; 1750 } 1751 1752 ret = check_dirent_target(trans, iter, d, &subvol_root, 1753 target_snapshot); 1754 if (ret) 1755 goto err; 1756 } else { 1757 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 1758 if (ret) 1759 goto err; 1760 1761 if (fsck_err_on(!target->inodes.nr, 1762 c, dirent_to_missing_inode, 1763 "dirent points to missing inode: (equiv %u)\n%s", 1764 equiv.snapshot, 1765 (printbuf_reset(&buf), 1766 bch2_bkey_val_to_text(&buf, c, k), 1767 buf.buf))) { 1768 ret = __remove_dirent(trans, d.k->p); 1769 if (ret) 1770 goto err; 1771 } 1772 1773 darray_for_each(target->inodes, i) { 1774 ret = check_dirent_target(trans, iter, d, 1775 &i->inode, i->snapshot); 1776 if (ret) 1777 goto err; 1778 } 1779 } 1780 1781 if (d.v->d_type == DT_DIR) 1782 for_each_visible_inode(c, s, dir, equiv.snapshot, i) 1783 i->count++; 1784 1785 out: 1786 err: 1787 fsck_err: 1788 printbuf_exit(&buf); 1789 bch_err_fn(c, ret); 1790 return ret; 1791 } 1792 1793 /* 1794 * Walk dirents: verify that they all have a corresponding S_ISDIR inode, 1795 * validate d_type 1796 */ 1797 int bch2_check_dirents(struct bch_fs *c) 1798 { 1799 struct inode_walker dir = inode_walker_init(); 1800 struct inode_walker target = inode_walker_init(); 1801 struct snapshots_seen s; 1802 struct bch_hash_info hash_info; 1803 1804 snapshots_seen_init(&s); 1805 1806 int ret = bch2_trans_run(c, 1807 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, 1808 POS(BCACHEFS_ROOT_INO, 0), 1809 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, 1810 k, 1811 NULL, NULL, 1812 BCH_TRANS_COMMIT_no_enospc, 1813 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); 1814 1815 snapshots_seen_exit(&s); 1816 inode_walker_exit(&dir); 1817 inode_walker_exit(&target); 1818 bch_err_fn(c, ret); 1819 return ret; 1820 } 1821 1822 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, 1823 struct bkey_s_c k, 1824 struct bch_hash_info *hash_info, 1825 struct inode_walker *inode) 1826 { 1827 struct bch_fs *c = trans->c; 1828 struct inode_walker_entry *i; 1829 int ret; 1830 1831 ret = check_key_has_snapshot(trans, iter, k); 1832 if (ret) 1833 return ret; 1834 1835 i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); 1836 ret = PTR_ERR_OR_ZERO(i); 1837 if (ret) 1838 return ret; 1839 1840 if (inode->first_this_inode && inode->inodes.nr) 1841 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); 1842 inode->first_this_inode = false; 1843 1844 if (fsck_err_on(!i, c, xattr_in_missing_inode, 1845 "xattr for missing inode %llu", 1846 k.k->p.inode)) 1847 return bch2_btree_delete_at(trans, iter, 0); 1848 1849 if (!i) 1850 return 0; 1851 1852 ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); 1853 fsck_err: 1854 bch_err_fn(c, ret); 1855 return ret; 1856 } 1857 1858 /* 1859 * Walk xattrs: verify that they all have a corresponding inode 1860 */ 1861 int bch2_check_xattrs(struct bch_fs *c) 1862 { 1863 struct inode_walker inode = inode_walker_init(); 1864 struct bch_hash_info hash_info; 1865 int ret = 0; 1866 1867 ret = bch2_trans_run(c, 1868 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 1869 POS(BCACHEFS_ROOT_INO, 0), 1870 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, 1871 k, 1872 NULL, NULL, 1873 BCH_TRANS_COMMIT_no_enospc, 1874 check_xattr(trans, &iter, k, &hash_info, &inode))); 1875 bch_err_fn(c, ret); 1876 return ret; 1877 } 1878 1879 static int check_root_trans(struct btree_trans *trans) 1880 { 1881 struct bch_fs *c = trans->c; 1882 struct bch_inode_unpacked root_inode; 1883 u32 snapshot; 1884 u64 inum; 1885 int ret; 1886 1887 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 1888 if (ret && !bch2_err_matches(ret, ENOENT)) 1889 return ret; 1890 1891 if (mustfix_fsck_err_on(ret, c, root_subvol_missing, 1892 "root subvol missing")) { 1893 struct bkey_i_subvolume root_subvol; 1894 1895 snapshot = U32_MAX; 1896 inum = BCACHEFS_ROOT_INO; 1897 1898 bkey_subvolume_init(&root_subvol.k_i); 1899 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; 1900 root_subvol.v.flags = 0; 1901 root_subvol.v.snapshot = cpu_to_le32(snapshot); 1902 root_subvol.v.inode = cpu_to_le64(inum); 1903 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); 1904 bch_err_msg(c, ret, "writing root subvol"); 1905 if (ret) 1906 goto err; 1907 } 1908 1909 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); 1910 if (ret && !bch2_err_matches(ret, ENOENT)) 1911 return ret; 1912 1913 if (mustfix_fsck_err_on(ret, c, root_dir_missing, 1914 "root directory missing") || 1915 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 1916 c, root_inode_not_dir, 1917 "root inode not a directory")) { 1918 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 1919 0, NULL); 1920 root_inode.bi_inum = inum; 1921 1922 ret = __write_inode(trans, &root_inode, snapshot); 1923 bch_err_msg(c, ret, "writing root inode"); 1924 } 1925 err: 1926 fsck_err: 1927 return ret; 1928 } 1929 1930 /* Get root directory, create if it doesn't exist: */ 1931 int bch2_check_root(struct bch_fs *c) 1932 { 1933 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1934 check_root_trans(trans)); 1935 bch_err_fn(c, ret); 1936 return ret; 1937 } 1938 1939 struct pathbuf_entry { 1940 u64 inum; 1941 u32 snapshot; 1942 }; 1943 1944 typedef DARRAY(struct pathbuf_entry) pathbuf; 1945 1946 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) 1947 { 1948 darray_for_each(*p, i) 1949 if (i->inum == inum && 1950 i->snapshot == snapshot) 1951 return true; 1952 return false; 1953 } 1954 1955 static int path_down(struct bch_fs *c, pathbuf *p, 1956 u64 inum, u32 snapshot) 1957 { 1958 int ret = darray_push(p, ((struct pathbuf_entry) { 1959 .inum = inum, 1960 .snapshot = snapshot, 1961 })); 1962 1963 if (ret) 1964 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", 1965 p->size); 1966 return ret; 1967 } 1968 1969 /* 1970 * Check that a given inode is reachable from the root: 1971 * 1972 * XXX: we should also be verifying that inodes are in the right subvolumes 1973 */ 1974 static int check_path(struct btree_trans *trans, 1975 pathbuf *p, 1976 struct bch_inode_unpacked *inode, 1977 u32 snapshot) 1978 { 1979 struct bch_fs *c = trans->c; 1980 int ret = 0; 1981 1982 snapshot = bch2_snapshot_equiv(c, snapshot); 1983 p->nr = 0; 1984 1985 while (!(inode->bi_inum == BCACHEFS_ROOT_INO && 1986 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { 1987 struct btree_iter dirent_iter; 1988 struct bkey_s_c_dirent d; 1989 u32 parent_snapshot = snapshot; 1990 1991 if (inode->bi_subvol) { 1992 u64 inum; 1993 1994 ret = subvol_lookup(trans, inode->bi_parent_subvol, 1995 &parent_snapshot, &inum); 1996 if (ret) 1997 break; 1998 } 1999 2000 d = dirent_get_by_pos(trans, &dirent_iter, 2001 SPOS(inode->bi_dir, inode->bi_dir_offset, 2002 parent_snapshot)); 2003 ret = bkey_err(d.s_c); 2004 if (ret && !bch2_err_matches(ret, ENOENT)) 2005 break; 2006 2007 if (!ret && !dirent_points_to_inode(d, inode)) { 2008 bch2_trans_iter_exit(trans, &dirent_iter); 2009 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 2010 } 2011 2012 if (bch2_err_matches(ret, ENOENT)) { 2013 if (fsck_err(c, inode_unreachable, 2014 "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", 2015 inode->bi_inum, snapshot, 2016 bch2_d_type_str(inode_d_type(inode)), 2017 inode->bi_nlink, 2018 inode->bi_dir, 2019 inode->bi_dir_offset)) 2020 ret = reattach_inode(trans, inode, snapshot); 2021 break; 2022 } 2023 2024 bch2_trans_iter_exit(trans, &dirent_iter); 2025 2026 if (!S_ISDIR(inode->bi_mode)) 2027 break; 2028 2029 ret = path_down(c, p, inode->bi_inum, snapshot); 2030 if (ret) { 2031 bch_err(c, "memory allocation failure"); 2032 return ret; 2033 } 2034 2035 snapshot = parent_snapshot; 2036 2037 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); 2038 if (ret) { 2039 /* Should have been caught in dirents pass */ 2040 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2041 bch_err(c, "error looking up parent directory: %i", ret); 2042 break; 2043 } 2044 2045 if (path_is_dup(p, inode->bi_inum, snapshot)) { 2046 /* XXX print path */ 2047 bch_err(c, "directory structure loop"); 2048 2049 darray_for_each(*p, i) 2050 pr_err("%llu:%u", i->inum, i->snapshot); 2051 pr_err("%llu:%u", inode->bi_inum, snapshot); 2052 2053 if (!fsck_err(c, dir_loop, "directory structure loop")) 2054 return 0; 2055 2056 ret = remove_backpointer(trans, inode); 2057 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2058 bch_err_msg(c, ret, "removing dirent"); 2059 if (ret) 2060 break; 2061 2062 ret = reattach_inode(trans, inode, snapshot); 2063 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2064 bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); 2065 break; 2066 } 2067 } 2068 fsck_err: 2069 bch_err_fn(c, ret); 2070 return ret; 2071 } 2072 2073 /* 2074 * Check for unreachable inodes, as well as loops in the directory structure: 2075 * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's 2076 * unreachable: 2077 */ 2078 int bch2_check_directory_structure(struct bch_fs *c) 2079 { 2080 struct bch_inode_unpacked u; 2081 pathbuf path = { 0, }; 2082 int ret; 2083 2084 ret = bch2_trans_run(c, 2085 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2086 BTREE_ITER_INTENT| 2087 BTREE_ITER_PREFETCH| 2088 BTREE_ITER_ALL_SNAPSHOTS, k, 2089 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2090 if (!bkey_is_inode(k.k)) 2091 continue; 2092 2093 BUG_ON(bch2_inode_unpack(k, &u)); 2094 2095 if (u.bi_flags & BCH_INODE_unlinked) 2096 continue; 2097 2098 check_path(trans, &path, &u, iter.pos.snapshot); 2099 }))); 2100 darray_exit(&path); 2101 2102 bch_err_fn(c, ret); 2103 return ret; 2104 } 2105 2106 struct nlink_table { 2107 size_t nr; 2108 size_t size; 2109 2110 struct nlink { 2111 u64 inum; 2112 u32 snapshot; 2113 u32 count; 2114 } *d; 2115 }; 2116 2117 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2118 u64 inum, u32 snapshot) 2119 { 2120 if (t->nr == t->size) { 2121 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2122 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2123 2124 if (!d) { 2125 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2126 new_size); 2127 return -BCH_ERR_ENOMEM_fsck_add_nlink; 2128 } 2129 2130 if (t->d) 2131 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2132 kvfree(t->d); 2133 2134 t->d = d; 2135 t->size = new_size; 2136 } 2137 2138 2139 t->d[t->nr++] = (struct nlink) { 2140 .inum = inum, 2141 .snapshot = snapshot, 2142 }; 2143 2144 return 0; 2145 } 2146 2147 static int nlink_cmp(const void *_l, const void *_r) 2148 { 2149 const struct nlink *l = _l; 2150 const struct nlink *r = _r; 2151 2152 return cmp_int(l->inum, r->inum); 2153 } 2154 2155 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2156 struct nlink_table *links, 2157 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2158 { 2159 struct nlink *link, key = { 2160 .inum = inum, .snapshot = U32_MAX, 2161 }; 2162 2163 if (inum < range_start || inum >= range_end) 2164 return; 2165 2166 link = __inline_bsearch(&key, links->d, links->nr, 2167 sizeof(links->d[0]), nlink_cmp); 2168 if (!link) 2169 return; 2170 2171 while (link > links->d && link[0].inum == link[-1].inum) 2172 --link; 2173 2174 for (; link < links->d + links->nr && link->inum == inum; link++) 2175 if (ref_visible(c, s, snapshot, link->snapshot)) { 2176 link->count++; 2177 if (link->snapshot >= snapshot) 2178 break; 2179 } 2180 } 2181 2182 noinline_for_stack 2183 static int check_nlinks_find_hardlinks(struct bch_fs *c, 2184 struct nlink_table *t, 2185 u64 start, u64 *end) 2186 { 2187 int ret = bch2_trans_run(c, 2188 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2189 POS(0, start), 2190 BTREE_ITER_INTENT| 2191 BTREE_ITER_PREFETCH| 2192 BTREE_ITER_ALL_SNAPSHOTS, k, ({ 2193 if (!bkey_is_inode(k.k)) 2194 continue; 2195 2196 /* Should never fail, checked by bch2_inode_invalid: */ 2197 struct bch_inode_unpacked u; 2198 BUG_ON(bch2_inode_unpack(k, &u)); 2199 2200 /* 2201 * Backpointer and directory structure checks are sufficient for 2202 * directories, since they can't have hardlinks: 2203 */ 2204 if (S_ISDIR(u.bi_mode)) 2205 continue; 2206 2207 if (!u.bi_nlink) 2208 continue; 2209 2210 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 2211 if (ret) { 2212 *end = k.k->p.offset; 2213 ret = 0; 2214 break; 2215 } 2216 0; 2217 }))); 2218 2219 bch_err_fn(c, ret); 2220 return ret; 2221 } 2222 2223 noinline_for_stack 2224 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 2225 u64 range_start, u64 range_end) 2226 { 2227 struct snapshots_seen s; 2228 2229 snapshots_seen_init(&s); 2230 2231 int ret = bch2_trans_run(c, 2232 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 2233 BTREE_ITER_INTENT| 2234 BTREE_ITER_PREFETCH| 2235 BTREE_ITER_ALL_SNAPSHOTS, k, ({ 2236 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 2237 if (ret) 2238 break; 2239 2240 if (k.k->type == KEY_TYPE_dirent) { 2241 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2242 2243 if (d.v->d_type != DT_DIR && 2244 d.v->d_type != DT_SUBVOL) 2245 inc_link(c, &s, links, range_start, range_end, 2246 le64_to_cpu(d.v->d_inum), 2247 bch2_snapshot_equiv(c, d.k->p.snapshot)); 2248 } 2249 0; 2250 }))); 2251 2252 snapshots_seen_exit(&s); 2253 2254 bch_err_fn(c, ret); 2255 return ret; 2256 } 2257 2258 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 2259 struct bkey_s_c k, 2260 struct nlink_table *links, 2261 size_t *idx, u64 range_end) 2262 { 2263 struct bch_fs *c = trans->c; 2264 struct bch_inode_unpacked u; 2265 struct nlink *link = &links->d[*idx]; 2266 int ret = 0; 2267 2268 if (k.k->p.offset >= range_end) 2269 return 1; 2270 2271 if (!bkey_is_inode(k.k)) 2272 return 0; 2273 2274 BUG_ON(bch2_inode_unpack(k, &u)); 2275 2276 if (S_ISDIR(u.bi_mode)) 2277 return 0; 2278 2279 if (!u.bi_nlink) 2280 return 0; 2281 2282 while ((cmp_int(link->inum, k.k->p.offset) ?: 2283 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 2284 BUG_ON(*idx == links->nr); 2285 link = &links->d[++*idx]; 2286 } 2287 2288 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 2289 c, inode_wrong_nlink, 2290 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 2291 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 2292 bch2_inode_nlink_get(&u), link->count)) { 2293 bch2_inode_nlink_set(&u, link->count); 2294 ret = __write_inode(trans, &u, k.k->p.snapshot); 2295 } 2296 fsck_err: 2297 return ret; 2298 } 2299 2300 noinline_for_stack 2301 static int check_nlinks_update_hardlinks(struct bch_fs *c, 2302 struct nlink_table *links, 2303 u64 range_start, u64 range_end) 2304 { 2305 size_t idx = 0; 2306 2307 int ret = bch2_trans_run(c, 2308 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 2309 POS(0, range_start), 2310 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, 2311 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2312 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 2313 if (ret < 0) { 2314 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 2315 return ret; 2316 } 2317 2318 return 0; 2319 } 2320 2321 int bch2_check_nlinks(struct bch_fs *c) 2322 { 2323 struct nlink_table links = { 0 }; 2324 u64 this_iter_range_start, next_iter_range_start = 0; 2325 int ret = 0; 2326 2327 do { 2328 this_iter_range_start = next_iter_range_start; 2329 next_iter_range_start = U64_MAX; 2330 2331 ret = check_nlinks_find_hardlinks(c, &links, 2332 this_iter_range_start, 2333 &next_iter_range_start); 2334 2335 ret = check_nlinks_walk_dirents(c, &links, 2336 this_iter_range_start, 2337 next_iter_range_start); 2338 if (ret) 2339 break; 2340 2341 ret = check_nlinks_update_hardlinks(c, &links, 2342 this_iter_range_start, 2343 next_iter_range_start); 2344 if (ret) 2345 break; 2346 2347 links.nr = 0; 2348 } while (next_iter_range_start != U64_MAX); 2349 2350 kvfree(links.d); 2351 bch_err_fn(c, ret); 2352 return ret; 2353 } 2354 2355 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 2356 struct bkey_s_c k) 2357 { 2358 struct bkey_s_c_reflink_p p; 2359 struct bkey_i_reflink_p *u; 2360 2361 if (k.k->type != KEY_TYPE_reflink_p) 2362 return 0; 2363 2364 p = bkey_s_c_to_reflink_p(k); 2365 2366 if (!p.v->front_pad && !p.v->back_pad) 2367 return 0; 2368 2369 u = bch2_trans_kmalloc(trans, sizeof(*u)); 2370 int ret = PTR_ERR_OR_ZERO(u); 2371 if (ret) 2372 return ret; 2373 2374 bkey_reassemble(&u->k_i, k); 2375 u->v.front_pad = 0; 2376 u->v.back_pad = 0; 2377 2378 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); 2379 } 2380 2381 int bch2_fix_reflink_p(struct bch_fs *c) 2382 { 2383 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 2384 return 0; 2385 2386 int ret = bch2_trans_run(c, 2387 for_each_btree_key_commit(trans, iter, 2388 BTREE_ID_extents, POS_MIN, 2389 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| 2390 BTREE_ITER_ALL_SNAPSHOTS, k, 2391 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2392 fix_reflink_p_key(trans, &iter, k))); 2393 bch_err_fn(c, ret); 2394 return ret; 2395 } 2396