1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_buf.h" 5 #include "btree_cache.h" 6 #include "btree_update.h" 7 #include "buckets.h" 8 #include "darray.h" 9 #include "dirent.h" 10 #include "error.h" 11 #include "fs-common.h" 12 #include "fsck.h" 13 #include "inode.h" 14 #include "keylist.h" 15 #include "recovery_passes.h" 16 #include "snapshot.h" 17 #include "super.h" 18 #include "xattr.h" 19 20 #include <linux/bsearch.h> 21 #include <linux/dcache.h> /* struct qstr */ 22 23 /* 24 * XXX: this is handling transaction restarts without returning 25 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: 26 */ 27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, 28 u32 snapshot) 29 { 30 u64 sectors = 0; 31 32 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, 33 SPOS(inum, 0, snapshot), 34 POS(inum, U64_MAX), 35 0, k, ({ 36 if (bkey_extent_is_allocation(k.k)) 37 sectors += k.k->size; 38 0; 39 })); 40 41 return ret ?: sectors; 42 } 43 44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, 45 u32 snapshot) 46 { 47 u64 subdirs = 0; 48 49 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, 50 SPOS(inum, 0, snapshot), 51 POS(inum, U64_MAX), 52 0, k, ({ 53 if (k.k->type == KEY_TYPE_dirent && 54 bkey_s_c_to_dirent(k).v->d_type == DT_DIR) 55 subdirs++; 56 0; 57 })); 58 59 return ret ?: subdirs; 60 } 61 62 static int subvol_lookup(struct btree_trans *trans, u32 subvol, 63 u32 *snapshot, u64 *inum) 64 { 65 struct bch_subvolume s; 66 int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); 67 68 *snapshot = le32_to_cpu(s.snapshot); 69 *inum = le64_to_cpu(s.inode); 70 return ret; 71 } 72 73 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, 74 struct bch_inode_unpacked *inode) 75 { 76 struct btree_iter iter; 77 struct bkey_s_c k; 78 int ret; 79 80 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), 81 
BTREE_ITER_all_snapshots, k, ret) { 82 if (k.k->p.offset != inode_nr) 83 break; 84 if (!bkey_is_inode(k.k)) 85 continue; 86 ret = bch2_inode_unpack(k, inode); 87 goto found; 88 } 89 ret = -BCH_ERR_ENOENT_inode; 90 found: 91 bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); 92 bch2_trans_iter_exit(trans, &iter); 93 return ret; 94 } 95 96 static int lookup_inode(struct btree_trans *trans, u64 inode_nr, 97 struct bch_inode_unpacked *inode, 98 u32 *snapshot) 99 { 100 struct btree_iter iter; 101 struct bkey_s_c k; 102 int ret; 103 104 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 105 SPOS(0, inode_nr, *snapshot), 0); 106 ret = bkey_err(k); 107 if (ret) 108 goto err; 109 110 ret = bkey_is_inode(k.k) 111 ? bch2_inode_unpack(k, inode) 112 : -BCH_ERR_ENOENT_inode; 113 if (!ret) 114 *snapshot = iter.pos.snapshot; 115 err: 116 bch2_trans_iter_exit(trans, &iter); 117 return ret; 118 } 119 120 static int lookup_dirent_in_snapshot(struct btree_trans *trans, 121 struct bch_hash_info hash_info, 122 subvol_inum dir, struct qstr *name, 123 u64 *target, unsigned *type, u32 snapshot) 124 { 125 struct btree_iter iter; 126 struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, 127 &hash_info, dir, name, 0, snapshot); 128 int ret = bkey_err(k); 129 if (ret) 130 return ret; 131 132 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); 133 *target = le64_to_cpu(d.v->d_inum); 134 *type = d.v->d_type; 135 bch2_trans_iter_exit(trans, &iter); 136 return 0; 137 } 138 139 static int __remove_dirent(struct btree_trans *trans, struct bpos pos) 140 { 141 struct bch_fs *c = trans->c; 142 struct btree_iter iter; 143 struct bch_inode_unpacked dir_inode; 144 struct bch_hash_info dir_hash_info; 145 int ret; 146 147 ret = lookup_first_inode(trans, pos.inode, &dir_inode); 148 if (ret) 149 goto err; 150 151 dir_hash_info = bch2_hash_info_init(c, &dir_inode); 152 153 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, 
BTREE_ITER_intent); 154 155 ret = bch2_btree_iter_traverse(&iter) ?: 156 bch2_hash_delete_at(trans, bch2_dirent_hash_desc, 157 &dir_hash_info, &iter, 158 BTREE_UPDATE_internal_snapshot_node); 159 bch2_trans_iter_exit(trans, &iter); 160 err: 161 bch_err_fn(c, ret); 162 return ret; 163 } 164 165 /* Get lost+found, create if it doesn't exist: */ 166 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, 167 struct bch_inode_unpacked *lostfound, 168 u64 reattaching_inum) 169 { 170 struct bch_fs *c = trans->c; 171 struct qstr lostfound_str = QSTR("lost+found"); 172 u64 inum = 0; 173 unsigned d_type = 0; 174 int ret; 175 176 struct bch_snapshot_tree st; 177 ret = bch2_snapshot_tree_lookup(trans, 178 bch2_snapshot_tree(c, snapshot), &st); 179 if (ret) 180 return ret; 181 182 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; 183 184 struct bch_subvolume subvol; 185 ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), 186 false, 0, &subvol); 187 bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", 188 le32_to_cpu(st.master_subvol), snapshot); 189 if (ret) 190 return ret; 191 192 if (!subvol.inode) { 193 struct btree_iter iter; 194 struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, 195 BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), 196 0, subvolume); 197 ret = PTR_ERR_OR_ZERO(subvol); 198 if (ret) 199 return ret; 200 201 subvol->v.inode = cpu_to_le64(reattaching_inum); 202 bch2_trans_iter_exit(trans, &iter); 203 } 204 205 root_inum.inum = le64_to_cpu(subvol.inode); 206 207 struct bch_inode_unpacked root_inode; 208 struct bch_hash_info root_hash_info; 209 u32 root_inode_snapshot = snapshot; 210 ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); 211 bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", 212 root_inum.inum, le32_to_cpu(st.master_subvol)); 213 if (ret) 214 return ret; 215 216 root_hash_info = bch2_hash_info_init(c, &root_inode); 217 218 
ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, 219 &lostfound_str, &inum, &d_type, snapshot); 220 if (bch2_err_matches(ret, ENOENT)) 221 goto create_lostfound; 222 223 bch_err_fn(c, ret); 224 if (ret) 225 return ret; 226 227 if (d_type != DT_DIR) { 228 bch_err(c, "error looking up lost+found: not a directory"); 229 return -BCH_ERR_ENOENT_not_directory; 230 } 231 232 /* 233 * The bch2_check_dirents pass has already run, dangling dirents 234 * shouldn't exist here: 235 */ 236 ret = lookup_inode(trans, inum, lostfound, &snapshot); 237 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", 238 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); 239 return ret; 240 241 create_lostfound: 242 /* 243 * XXX: we could have a nicer log message here if we had a nice way to 244 * walk backpointers to print a path 245 */ 246 bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); 247 248 u64 now = bch2_current_time(c); 249 struct btree_iter lostfound_iter = { NULL }; 250 u64 cpu = raw_smp_processor_id(); 251 252 bch2_inode_init_early(c, lostfound); 253 bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); 254 lostfound->bi_dir = root_inode.bi_inum; 255 256 root_inode.bi_nlink++; 257 258 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); 259 if (ret) 260 goto err; 261 262 bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); 263 ret = bch2_btree_iter_traverse(&lostfound_iter); 264 if (ret) 265 goto err; 266 267 ret = bch2_dirent_create_snapshot(trans, 268 0, root_inode.bi_inum, snapshot, &root_hash_info, 269 mode_to_type(lostfound->bi_mode), 270 &lostfound_str, 271 lostfound->bi_inum, 272 &lostfound->bi_dir_offset, 273 STR_HASH_must_create) ?: 274 bch2_inode_write_flags(trans, &lostfound_iter, lostfound, 275 BTREE_UPDATE_internal_snapshot_node); 276 err: 277 bch_err_msg(c, ret, "creating lost+found"); 278 bch2_trans_iter_exit(trans, 
&lostfound_iter); 279 return ret; 280 } 281 282 static int reattach_inode(struct btree_trans *trans, 283 struct bch_inode_unpacked *inode, 284 u32 inode_snapshot) 285 { 286 struct bch_hash_info dir_hash; 287 struct bch_inode_unpacked lostfound; 288 char name_buf[20]; 289 struct qstr name; 290 u64 dir_offset = 0; 291 u32 dirent_snapshot = inode_snapshot; 292 int ret; 293 294 if (inode->bi_subvol) { 295 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 296 297 u64 root_inum; 298 ret = subvol_lookup(trans, inode->bi_parent_subvol, 299 &dirent_snapshot, &root_inum); 300 if (ret) 301 return ret; 302 303 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); 304 } else { 305 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); 306 } 307 308 ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); 309 if (ret) 310 return ret; 311 312 if (S_ISDIR(inode->bi_mode)) { 313 lostfound.bi_nlink++; 314 315 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); 316 if (ret) 317 return ret; 318 } 319 320 dir_hash = bch2_hash_info_init(trans->c, &lostfound); 321 322 name = (struct qstr) QSTR(name_buf); 323 324 ret = bch2_dirent_create_snapshot(trans, 325 inode->bi_parent_subvol, lostfound.bi_inum, 326 dirent_snapshot, 327 &dir_hash, 328 inode_d_type(inode), 329 &name, 330 inode->bi_subvol ?: inode->bi_inum, 331 &dir_offset, 332 STR_HASH_must_create); 333 if (ret) 334 return ret; 335 336 inode->bi_dir = lostfound.bi_inum; 337 inode->bi_dir_offset = dir_offset; 338 339 return __bch2_fsck_write_inode(trans, inode, inode_snapshot); 340 } 341 342 static int remove_backpointer(struct btree_trans *trans, 343 struct bch_inode_unpacked *inode) 344 { 345 struct btree_iter iter; 346 struct bkey_s_c_dirent d; 347 int ret; 348 349 d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, 350 POS(inode->bi_dir, inode->bi_dir_offset), 0, 351 dirent); 352 ret = bkey_err(d) ?: 353 __remove_dirent(trans, d.k->p); 354 bch2_trans_iter_exit(trans, &iter); 
355 return ret; 356 } 357 358 static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) 359 { 360 struct bch_fs *c = trans->c; 361 362 struct bch_inode_unpacked inode; 363 int ret = bch2_inode_find_by_inum_trans(trans, 364 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 365 &inode); 366 if (ret) 367 return ret; 368 369 ret = remove_backpointer(trans, &inode); 370 bch_err_msg(c, ret, "removing dirent"); 371 if (ret) 372 return ret; 373 374 ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); 375 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 376 return ret; 377 } 378 379 static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) 380 { 381 struct bch_fs *c = trans->c; 382 383 if (!bch2_snapshot_is_leaf(c, snapshotid)) { 384 bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); 385 return -BCH_ERR_fsck_repair_unimplemented; 386 } 387 388 /* 389 * If inum isn't set, that means we're being called from check_dirents, 390 * not check_inodes - the root of this subvolume doesn't exist or we 391 * would have found it there: 392 */ 393 if (!inum) { 394 struct btree_iter inode_iter = {}; 395 struct bch_inode_unpacked new_inode; 396 u64 cpu = raw_smp_processor_id(); 397 398 bch2_inode_init_early(c, &new_inode); 399 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); 400 401 new_inode.bi_subvol = subvolid; 402 403 int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: 404 bch2_btree_iter_traverse(&inode_iter) ?: 405 bch2_inode_write(trans, &inode_iter, &new_inode); 406 bch2_trans_iter_exit(trans, &inode_iter); 407 if (ret) 408 return ret; 409 410 inum = new_inode.bi_inum; 411 } 412 413 bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); 414 415 struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); 416 int ret = PTR_ERR_OR_ZERO(new_subvol); 417 if 
(ret) 418 return ret; 419 420 bkey_subvolume_init(&new_subvol->k_i); 421 new_subvol->k.p.offset = subvolid; 422 new_subvol->v.snapshot = cpu_to_le32(snapshotid); 423 new_subvol->v.inode = cpu_to_le64(inum); 424 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); 425 if (ret) 426 return ret; 427 428 struct btree_iter iter; 429 struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, 430 BTREE_ID_snapshots, POS(0, snapshotid), 431 0, snapshot); 432 ret = PTR_ERR_OR_ZERO(s); 433 bch_err_msg(c, ret, "getting snapshot %u", snapshotid); 434 if (ret) 435 return ret; 436 437 u32 snapshot_tree = le32_to_cpu(s->v.tree); 438 439 s->v.subvol = cpu_to_le32(subvolid); 440 SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); 441 bch2_trans_iter_exit(trans, &iter); 442 443 struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, 444 BTREE_ID_snapshot_trees, POS(0, snapshot_tree), 445 0, snapshot_tree); 446 ret = PTR_ERR_OR_ZERO(st); 447 bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); 448 if (ret) 449 return ret; 450 451 if (!st->v.master_subvol) 452 st->v.master_subvol = cpu_to_le32(subvolid); 453 454 bch2_trans_iter_exit(trans, &iter); 455 return 0; 456 } 457 458 static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum) 459 { 460 struct bch_fs *c = trans->c; 461 unsigned i_mode = S_IFREG; 462 u64 i_size = 0; 463 464 switch (btree) { 465 case BTREE_ID_extents: { 466 struct btree_iter iter = {}; 467 468 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); 469 struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); 470 bch2_trans_iter_exit(trans, &iter); 471 int ret = bkey_err(k); 472 if (ret) 473 return ret; 474 475 i_size = k.k->p.offset << 9; 476 break; 477 } 478 case BTREE_ID_dirents: 479 i_mode = S_IFDIR; 480 break; 481 case BTREE_ID_xattrs: 482 break; 483 default: 484 BUG(); 485 } 486 487 struct bch_inode_unpacked new_inode; 488 
bch2_inode_init_early(c, &new_inode); 489 bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); 490 new_inode.bi_size = i_size; 491 new_inode.bi_inum = inum; 492 493 return __bch2_fsck_write_inode(trans, &new_inode, snapshot); 494 } 495 496 struct snapshots_seen { 497 struct bpos pos; 498 snapshot_id_list ids; 499 }; 500 501 static inline void snapshots_seen_exit(struct snapshots_seen *s) 502 { 503 darray_exit(&s->ids); 504 } 505 506 static inline void snapshots_seen_init(struct snapshots_seen *s) 507 { 508 memset(s, 0, sizeof(*s)); 509 } 510 511 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) 512 { 513 u32 *i; 514 __darray_for_each(s->ids, i) { 515 if (*i == id) 516 return 0; 517 if (*i > id) 518 break; 519 } 520 521 int ret = darray_insert_item(&s->ids, i - s->ids.data, id); 522 if (ret) 523 bch_err(c, "error reallocating snapshots_seen table (size %zu)", 524 s->ids.size); 525 return ret; 526 } 527 528 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, 529 enum btree_id btree_id, struct bpos pos) 530 { 531 if (!bkey_eq(s->pos, pos)) 532 s->ids.nr = 0; 533 s->pos = pos; 534 535 return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); 536 } 537 538 /** 539 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, 540 * and @ancestor hasn't been overwritten in @seen 541 * 542 * @c: filesystem handle 543 * @seen: list of snapshot ids already seen at current position 544 * @id: descendent snapshot id 545 * @ancestor: ancestor snapshot id 546 * 547 * Returns: whether key in @ancestor snapshot is visible in @id snapshot 548 */ 549 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, 550 u32 id, u32 ancestor) 551 { 552 ssize_t i; 553 554 EBUG_ON(id > ancestor); 555 556 /* @ancestor should be the snapshot most recently added to @seen */ 557 EBUG_ON(ancestor != seen->pos.snapshot); 558 EBUG_ON(ancestor != 
darray_last(seen->ids)); 559 560 if (id == ancestor) 561 return true; 562 563 if (!bch2_snapshot_is_ancestor(c, id, ancestor)) 564 return false; 565 566 /* 567 * We know that @id is a descendant of @ancestor, we're checking if 568 * we've seen a key that overwrote @ancestor - i.e. also a descendent of 569 * @ascestor and with @id as a descendent. 570 * 571 * But we already know that we're scanning IDs between @id and @ancestor 572 * numerically, since snapshot ID lists are kept sorted, so if we find 573 * an id that's an ancestor of @id we're done: 574 */ 575 576 for (i = seen->ids.nr - 2; 577 i >= 0 && seen->ids.data[i] >= id; 578 --i) 579 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) 580 return false; 581 582 return true; 583 } 584 585 /** 586 * ref_visible - given a key with snapshot id @src that points to a key with 587 * snapshot id @dst, test whether there is some snapshot in which @dst is 588 * visible. 589 * 590 * @c: filesystem handle 591 * @s: list of snapshot IDs already seen at @src 592 * @src: snapshot ID of src key 593 * @dst: snapshot ID of dst key 594 * Returns: true if there is some snapshot in which @dst is visible 595 * 596 * Assumes we're visiting @src keys in natural key order 597 */ 598 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, 599 u32 src, u32 dst) 600 { 601 return dst <= src 602 ? 
key_visible_in_snapshot(c, s, dst, src) 603 : bch2_snapshot_is_ancestor(c, src, dst); 604 } 605 606 static int ref_visible2(struct bch_fs *c, 607 u32 src, struct snapshots_seen *src_seen, 608 u32 dst, struct snapshots_seen *dst_seen) 609 { 610 if (dst > src) { 611 swap(dst, src); 612 swap(dst_seen, src_seen); 613 } 614 return key_visible_in_snapshot(c, src_seen, dst, src); 615 } 616 617 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 618 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ 619 (_i)->snapshot <= (_snapshot); _i++) \ 620 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) 621 622 struct inode_walker_entry { 623 struct bch_inode_unpacked inode; 624 u32 snapshot; 625 bool seen_this_pos; 626 u64 count; 627 }; 628 629 struct inode_walker { 630 bool first_this_inode; 631 bool recalculate_sums; 632 struct bpos last_pos; 633 634 DARRAY(struct inode_walker_entry) inodes; 635 }; 636 637 static void inode_walker_exit(struct inode_walker *w) 638 { 639 darray_exit(&w->inodes); 640 } 641 642 static struct inode_walker inode_walker_init(void) 643 { 644 return (struct inode_walker) { 0, }; 645 } 646 647 static int add_inode(struct bch_fs *c, struct inode_walker *w, 648 struct bkey_s_c inode) 649 { 650 struct bch_inode_unpacked u; 651 652 BUG_ON(bch2_inode_unpack(inode, &u)); 653 654 return darray_push(&w->inodes, ((struct inode_walker_entry) { 655 .inode = u, 656 .snapshot = inode.k->p.snapshot, 657 })); 658 } 659 660 static int get_inodes_all_snapshots(struct btree_trans *trans, 661 struct inode_walker *w, u64 inum) 662 { 663 struct bch_fs *c = trans->c; 664 struct btree_iter iter; 665 struct bkey_s_c k; 666 int ret; 667 668 w->recalculate_sums = false; 669 w->inodes.nr = 0; 670 671 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 672 BTREE_ITER_all_snapshots, k, ret) { 673 if (k.k->p.offset != inum) 674 break; 675 676 if (bkey_is_inode(k.k)) 677 add_inode(c, w, k); 678 } 679 
bch2_trans_iter_exit(trans, &iter); 680 681 if (ret) 682 return ret; 683 684 w->first_this_inode = true; 685 return 0; 686 } 687 688 static struct inode_walker_entry * 689 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) 690 { 691 bool is_whiteout = k.k->type == KEY_TYPE_whiteout; 692 693 struct inode_walker_entry *i; 694 __darray_for_each(w->inodes, i) 695 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) 696 goto found; 697 698 return NULL; 699 found: 700 BUG_ON(k.k->p.snapshot > i->snapshot); 701 702 if (k.k->p.snapshot != i->snapshot && !is_whiteout) { 703 struct inode_walker_entry new = *i; 704 705 new.snapshot = k.k->p.snapshot; 706 new.count = 0; 707 708 struct printbuf buf = PRINTBUF; 709 bch2_bkey_val_to_text(&buf, c, k); 710 711 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" 712 "unexpected because we should always update the inode when we update a key in that inode\n" 713 "%s", 714 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); 715 printbuf_exit(&buf); 716 717 while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) 718 --i; 719 720 size_t pos = i - w->inodes.data; 721 int ret = darray_insert_item(&w->inodes, pos, new); 722 if (ret) 723 return ERR_PTR(ret); 724 725 i = w->inodes.data + pos; 726 } 727 728 return i; 729 } 730 731 static struct inode_walker_entry *walk_inode(struct btree_trans *trans, 732 struct inode_walker *w, 733 struct bkey_s_c k) 734 { 735 if (w->last_pos.inode != k.k->p.inode) { 736 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); 737 if (ret) 738 return ERR_PTR(ret); 739 } else if (bkey_cmp(w->last_pos, k.k->p)) { 740 darray_for_each(w->inodes, i) 741 i->seen_this_pos = false; 742 } 743 744 w->last_pos = k.k->p; 745 746 return lookup_inode_for_snapshot(trans->c, w, k); 747 } 748 749 static int get_visible_inodes(struct btree_trans *trans, 750 struct inode_walker *w, 751 struct snapshots_seen *s, 752 u64 inum) 753 { 754 
struct bch_fs *c = trans->c; 755 struct btree_iter iter; 756 struct bkey_s_c k; 757 int ret; 758 759 w->inodes.nr = 0; 760 761 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), 762 BTREE_ITER_all_snapshots, k, ret) { 763 if (k.k->p.offset != inum) 764 break; 765 766 if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) 767 continue; 768 769 if (bkey_is_inode(k.k)) 770 add_inode(c, w, k); 771 772 if (k.k->p.snapshot >= s->pos.snapshot) 773 break; 774 } 775 bch2_trans_iter_exit(trans, &iter); 776 777 return ret; 778 } 779 780 static int hash_redo_key(struct btree_trans *trans, 781 const struct bch_hash_desc desc, 782 struct bch_hash_info *hash_info, 783 struct btree_iter *k_iter, struct bkey_s_c k) 784 { 785 struct bkey_i *delete; 786 struct bkey_i *tmp; 787 788 delete = bch2_trans_kmalloc(trans, sizeof(*delete)); 789 if (IS_ERR(delete)) 790 return PTR_ERR(delete); 791 792 tmp = bch2_bkey_make_mut_noupdate(trans, k); 793 if (IS_ERR(tmp)) 794 return PTR_ERR(tmp); 795 796 bkey_init(&delete->k); 797 delete->k.p = k_iter->pos; 798 return bch2_btree_iter_traverse(k_iter) ?: 799 bch2_trans_update(trans, k_iter, delete, 0) ?: 800 bch2_hash_set_in_snapshot(trans, desc, hash_info, 801 (subvol_inum) { 0, k.k->p.inode }, 802 k.k->p.snapshot, tmp, 803 STR_HASH_must_create| 804 BTREE_UPDATE_internal_snapshot_node) ?: 805 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 806 } 807 808 static int hash_check_key(struct btree_trans *trans, 809 const struct bch_hash_desc desc, 810 struct bch_hash_info *hash_info, 811 struct btree_iter *k_iter, struct bkey_s_c hash_k) 812 { 813 struct bch_fs *c = trans->c; 814 struct btree_iter iter = { NULL }; 815 struct printbuf buf = PRINTBUF; 816 struct bkey_s_c k; 817 u64 hash; 818 int ret = 0; 819 820 if (hash_k.k->type != desc.key_type) 821 return 0; 822 823 hash = desc.hash_bkey(hash_info, hash_k); 824 825 if (likely(hash == hash_k.k->p.offset)) 826 return 0; 827 828 if (hash_k.k->p.offset < hash) 829 
goto bad_hash; 830 831 for_each_btree_key_norestart(trans, iter, desc.btree_id, 832 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), 833 BTREE_ITER_slots, k, ret) { 834 if (bkey_eq(k.k->p, hash_k.k->p)) 835 break; 836 837 if (fsck_err_on(k.k->type == desc.key_type && 838 !desc.cmp_bkey(k, hash_k), 839 trans, hash_table_key_duplicate, 840 "duplicate hash table keys:\n%s", 841 (printbuf_reset(&buf), 842 bch2_bkey_val_to_text(&buf, c, hash_k), 843 buf.buf))) { 844 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; 845 break; 846 } 847 848 if (bkey_deleted(k.k)) { 849 bch2_trans_iter_exit(trans, &iter); 850 goto bad_hash; 851 } 852 } 853 out: 854 bch2_trans_iter_exit(trans, &iter); 855 printbuf_exit(&buf); 856 return ret; 857 bad_hash: 858 if (fsck_err(trans, hash_table_key_wrong_offset, 859 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", 860 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, 861 (printbuf_reset(&buf), 862 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { 863 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); 864 bch_err_fn(c, ret); 865 if (ret) 866 return ret; 867 ret = -BCH_ERR_transaction_restart_nested; 868 } 869 fsck_err: 870 goto out; 871 } 872 873 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, 874 struct btree_iter *iter, 875 struct bpos pos) 876 { 877 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); 878 } 879 880 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, 881 struct btree_iter *iter, 882 struct bch_inode_unpacked *inode, 883 u32 *snapshot) 884 { 885 if (inode->bi_subvol) { 886 u64 inum; 887 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); 888 if (ret) 889 return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); 890 } 891 892 return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); 893 } 894 895 
static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, 896 struct bkey_s_c_dirent d) 897 { 898 return inode->bi_dir == d.k->p.inode && 899 inode->bi_dir_offset == d.k->p.offset; 900 } 901 902 static bool dirent_points_to_inode(struct bkey_s_c_dirent d, 903 struct bch_inode_unpacked *inode) 904 { 905 return d.v->d_type == DT_SUBVOL 906 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol 907 : le64_to_cpu(d.v->d_inum) == inode->bi_inum; 908 } 909 910 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) 911 { 912 struct btree_iter iter; 913 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); 914 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; 915 bch2_trans_iter_exit(trans, &iter); 916 return ret; 917 } 918 919 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, 920 struct bch_inode_unpacked *inode, 921 u32 inode_snapshot, bool *write_inode) 922 { 923 struct bch_fs *c = trans->c; 924 struct printbuf buf = PRINTBUF; 925 926 struct btree_iter dirent_iter = {}; 927 struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); 928 int ret = bkey_err(d); 929 if (ret && !bch2_err_matches(ret, ENOENT)) 930 return ret; 931 932 if (fsck_err_on(ret, 933 trans, inode_points_to_missing_dirent, 934 "inode points to missing dirent\n%s", 935 (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || 936 fsck_err_on(!ret && !dirent_points_to_inode(d, inode), 937 trans, inode_points_to_wrong_dirent, 938 "inode points to dirent that does not point back:\n%s", 939 (bch2_bkey_val_to_text(&buf, c, inode_k), 940 prt_newline(&buf), 941 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 942 /* 943 * We just clear the backpointer fields for now. If we find a 944 * dirent that points to this inode in check_dirents(), we'll 945 * update it then; then when we get to check_path() if the 946 * backpointer is still 0 we'll reattach it. 
947 */ 948 inode->bi_dir = 0; 949 inode->bi_dir_offset = 0; 950 inode->bi_flags &= ~BCH_INODE_backptr_untrusted; 951 *write_inode = true; 952 } 953 954 ret = 0; 955 fsck_err: 956 bch2_trans_iter_exit(trans, &dirent_iter); 957 printbuf_exit(&buf); 958 bch_err_fn(c, ret); 959 return ret; 960 } 961 962 static int check_inode(struct btree_trans *trans, 963 struct btree_iter *iter, 964 struct bkey_s_c k, 965 struct bch_inode_unpacked *prev, 966 struct snapshots_seen *s, 967 bool full) 968 { 969 struct bch_fs *c = trans->c; 970 struct bch_inode_unpacked u; 971 bool do_update = false; 972 int ret; 973 974 ret = bch2_check_key_has_snapshot(trans, iter, k); 975 if (ret < 0) 976 goto err; 977 if (ret) 978 return 0; 979 980 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 981 if (ret) 982 goto err; 983 984 if (!bkey_is_inode(k.k)) 985 return 0; 986 987 BUG_ON(bch2_inode_unpack(k, &u)); 988 989 if (!full && 990 !(u.bi_flags & (BCH_INODE_i_size_dirty| 991 BCH_INODE_i_sectors_dirty| 992 BCH_INODE_unlinked))) 993 return 0; 994 995 if (prev->bi_inum != u.bi_inum) 996 *prev = u; 997 998 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || 999 inode_d_type(prev) != inode_d_type(&u), 1000 trans, inode_snapshot_mismatch, 1001 "inodes in different snapshots don't match")) { 1002 bch_err(c, "repair not implemented yet"); 1003 return -BCH_ERR_fsck_repair_unimplemented; 1004 } 1005 1006 if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && 1007 bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { 1008 struct bpos new_min_pos; 1009 1010 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); 1011 if (ret) 1012 goto err; 1013 1014 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; 1015 1016 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); 1017 1018 bch_err_msg(c, ret, "in fsck updating inode"); 1019 if (ret) 1020 return ret; 1021 1022 if (!bpos_eq(new_min_pos, POS_MIN)) 1023 
bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); 1024 return 0; 1025 } 1026 1027 if (u.bi_flags & BCH_INODE_unlinked) { 1028 ret = check_inode_deleted_list(trans, k.k->p); 1029 if (ret < 0) 1030 return ret; 1031 1032 fsck_err_on(!ret, 1033 trans, unlinked_inode_not_on_deleted_list, 1034 "inode %llu:%u unlinked, but not on deleted list", 1035 u.bi_inum, k.k->p.snapshot); 1036 ret = 0; 1037 } 1038 1039 if (u.bi_flags & BCH_INODE_unlinked && 1040 (!c->sb.clean || 1041 fsck_err(trans, inode_unlinked_but_clean, 1042 "filesystem marked clean, but inode %llu unlinked", 1043 u.bi_inum))) { 1044 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); 1045 bch_err_msg(c, ret, "in fsck deleting inode"); 1046 return ret; 1047 } 1048 1049 if (u.bi_flags & BCH_INODE_i_size_dirty && 1050 (!c->sb.clean || 1051 fsck_err(trans, inode_i_size_dirty_but_clean, 1052 "filesystem marked clean, but inode %llu has i_size dirty", 1053 u.bi_inum))) { 1054 bch_verbose(c, "truncating inode %llu", u.bi_inum); 1055 1056 /* 1057 * XXX: need to truncate partial blocks too here - or ideally 1058 * just switch units to bytes and that issue goes away 1059 */ 1060 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, 1061 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, 1062 iter->pos.snapshot), 1063 POS(u.bi_inum, U64_MAX), 1064 0, NULL); 1065 bch_err_msg(c, ret, "in fsck truncating inode"); 1066 if (ret) 1067 return ret; 1068 1069 /* 1070 * We truncated without our normal sector accounting hook, just 1071 * make sure we recalculate it: 1072 */ 1073 u.bi_flags |= BCH_INODE_i_sectors_dirty; 1074 1075 u.bi_flags &= ~BCH_INODE_i_size_dirty; 1076 do_update = true; 1077 } 1078 1079 if (u.bi_flags & BCH_INODE_i_sectors_dirty && 1080 (!c->sb.clean || 1081 fsck_err(trans, inode_i_sectors_dirty_but_clean, 1082 "filesystem marked clean, but inode %llu has i_sectors dirty", 1083 u.bi_inum))) { 1084 s64 sectors; 1085 1086 bch_verbose(c, "recounting sectors for inode %llu", 
1087 u.bi_inum); 1088 1089 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); 1090 if (sectors < 0) { 1091 bch_err_msg(c, sectors, "in fsck recounting inode sectors"); 1092 return sectors; 1093 } 1094 1095 u.bi_sectors = sectors; 1096 u.bi_flags &= ~BCH_INODE_i_sectors_dirty; 1097 do_update = true; 1098 } 1099 1100 if (u.bi_flags & BCH_INODE_backptr_untrusted) { 1101 u.bi_dir = 0; 1102 u.bi_dir_offset = 0; 1103 u.bi_flags &= ~BCH_INODE_backptr_untrusted; 1104 do_update = true; 1105 } 1106 1107 if (u.bi_dir || u.bi_dir_offset) { 1108 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); 1109 if (ret) 1110 goto err; 1111 } 1112 1113 if (fsck_err_on(u.bi_parent_subvol && 1114 (u.bi_subvol == 0 || 1115 u.bi_subvol == BCACHEFS_ROOT_SUBVOL), 1116 trans, inode_bi_parent_nonzero, 1117 "inode %llu:%u has subvol %u but nonzero parent subvol %u", 1118 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { 1119 u.bi_parent_subvol = 0; 1120 do_update = true; 1121 } 1122 1123 if (u.bi_subvol) { 1124 struct bch_subvolume s; 1125 1126 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); 1127 if (ret && !bch2_err_matches(ret, ENOENT)) 1128 goto err; 1129 1130 if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1131 ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); 1132 goto do_update; 1133 } 1134 1135 if (fsck_err_on(ret, 1136 trans, inode_bi_subvol_missing, 1137 "inode %llu:%u bi_subvol points to missing subvolume %u", 1138 u.bi_inum, k.k->p.snapshot, u.bi_subvol) || 1139 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || 1140 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), 1141 k.k->p.snapshot), 1142 trans, inode_bi_subvol_wrong, 1143 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", 1144 u.bi_inum, k.k->p.snapshot, u.bi_subvol, 1145 le64_to_cpu(s.inode), 1146 le32_to_cpu(s.snapshot))) { 1147 u.bi_subvol = 0; 1148 u.bi_parent_subvol = 0; 1149 do_update = 
			true;
		}
	}
do_update:
	if (do_update) {
		/* Write back the repaired inode in the snapshot we found it in */
		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			return ret;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk every inode key (all snapshots) and run check_inode() on each,
 * committing after each key.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	bool full = c->opts.fsck;
	struct bch_inode_unpacked prev = { 0 };
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &prev, &s, full)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Does a key in @btree make sense for an inode with mode @mode?
 * Extents belong to regular files and symlinks, dirents to directories;
 * xattrs are legal on any inode type.
 */
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
	switch (btree) {
	case BTREE_ID_extents:
		return S_ISREG(mode) || S_ISLNK(mode);
	case BTREE_ID_dirents:
		return S_ISDIR(mode);
	case BTREE_ID_xattrs:
		return true;
	default:
		BUG();
	}
}

/*
 * Verify that @k has a backing inode (@i, from the inode walker) of a
 * compatible mode; delete the key if the inode is missing or of the wrong
 * type.  If the inodes btree was lost, reconstruct the inode instead.
 * @i may be an ERR_PTR from walk_inode(), or NULL if no inode was found.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* back up so the walker re-reads the reconstructed inode */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n %s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}

/*
 * For each inode snapshot version the walker accumulated, verify that
 * bi_sectors matches the number of allocated sectors counted while walking
 * extents; recount and repair on mismatch.
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		/* our incremental count disagreeing with a fresh recount is a fsck bug */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			return -BCH_ERR_internal_fsck_err;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}

/* As above, but also report if the transaction was restarted underneath us */
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
	u32 restart_count = trans->restart_count;
	return check_i_sectors_notnested(trans, w) ?:
		trans_was_restarted(trans, restart_count);
}

/* End position of an extent previously seen in one snapshot */
struct extent_end {
	u32			snapshot;
	u64			offset;
	struct snapshots_seen	seen;
};

/* Per-inode list of extent end positions, one entry per snapshot, sorted by snapshot */
struct extent_ends {
	struct bpos			last_pos;
	DARRAY(struct extent_end)	e;
};

static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}

static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}

static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}

/*
 * Record the end position of extent @k, replacing any existing entry for the
 * same snapshot or inserting in snapshot order.  Takes a private copy of
 * @seen's snapshot id list (the entry outlives the caller's buffer).
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	n.seen.ids.data = kmemdup(seen->ids.data,
				  sizeof(seen->ids.data[0]) * seen->ids.size,
				  GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			/* overwrite existing entry for this snapshot in place */
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		/* list is sorted by snapshot; stop at the insertion point */
		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}

/*
 * Repair a pair of overlapping extents: @pos1 is the previously-seen extent's
 * position, @pos2 the newly-seen (overlapping) key.  The older extent is
 * overwritten; *fixed is set if a repair was committed.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = { NULL };
	struct bkey_s_c k1, k2;
	int ret;

	/* caller guarantees pos1 is strictly past the start of pos2 */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

bch2_trans_iter_init(trans, &iter1, btree, pos1, 1370 BTREE_ITER_all_snapshots| 1371 BTREE_ITER_not_extents); 1372 k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); 1373 ret = bkey_err(k1); 1374 if (ret) 1375 goto err; 1376 1377 prt_str(&buf, "\n "); 1378 bch2_bkey_val_to_text(&buf, c, k1); 1379 1380 if (!bpos_eq(pos1, k1.k->p)) { 1381 prt_str(&buf, "\n wanted\n "); 1382 bch2_bpos_to_text(&buf, pos1); 1383 prt_str(&buf, "\n "); 1384 bch2_bkey_to_text(&buf, &pos2); 1385 1386 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", 1387 __func__, buf.buf); 1388 ret = -BCH_ERR_internal_fsck_err; 1389 goto err; 1390 } 1391 1392 bch2_trans_copy_iter(&iter2, &iter1); 1393 1394 while (1) { 1395 bch2_btree_iter_advance(&iter2); 1396 1397 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); 1398 ret = bkey_err(k2); 1399 if (ret) 1400 goto err; 1401 1402 if (bpos_ge(k2.k->p, pos2.p)) 1403 break; 1404 } 1405 1406 prt_str(&buf, "\n "); 1407 bch2_bkey_val_to_text(&buf, c, k2); 1408 1409 if (bpos_gt(k2.k->p, pos2.p) || 1410 pos2.size != k2.k->size) { 1411 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", 1412 __func__, buf.buf); 1413 ret = -BCH_ERR_internal_fsck_err; 1414 goto err; 1415 } 1416 1417 prt_printf(&buf, "\n overwriting %s extent", 1418 pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); 1419 1420 if (fsck_err(trans, extent_overlapping, 1421 "overlapping extents%s", buf.buf)) { 1422 struct btree_iter *old_iter = &iter1; 1423 struct disk_reservation res = { 0 }; 1424 1425 if (pos1.snapshot < pos2.p.snapshot) { 1426 old_iter = &iter2; 1427 swap(k1, k2); 1428 } 1429 1430 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); 1431 1432 ret = bch2_trans_update_extent_overwrite(trans, old_iter, 1433 BTREE_UPDATE_internal_snapshot_node, 1434 k1, k2) ?: 1435 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); 1436 bch2_disk_reservation_put(c, &res); 1437 1438 if (ret) 1439 goto err; 1440 1441 *fixed = true; 1442 1443 if (pos1.snapshot == pos2.p.snapshot) { 1444 /* 1445 * We overwrote the first extent, and did the overwrite 1446 * in the same snapshot: 1447 */ 1448 extent_end->offset = bkey_start_offset(&pos2); 1449 } else if (pos1.snapshot > pos2.p.snapshot) { 1450 /* 1451 * We overwrote the first extent in pos2's snapshot: 1452 */ 1453 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); 1454 } else { 1455 /* 1456 * We overwrote the second extent - restart 1457 * check_extent() from the top: 1458 */ 1459 ret = -BCH_ERR_transaction_restart_nested; 1460 } 1461 } 1462 fsck_err: 1463 err: 1464 bch2_trans_iter_exit(trans, &iter2); 1465 bch2_trans_iter_exit(trans, &iter1); 1466 printbuf_exit(&buf); 1467 return ret; 1468 } 1469 1470 static int check_overlapping_extents(struct btree_trans *trans, 1471 struct snapshots_seen *seen, 1472 struct extent_ends *extent_ends, 1473 struct bkey_s_c k, 1474 struct btree_iter *iter, 1475 bool *fixed) 1476 { 1477 struct bch_fs *c = trans->c; 1478 int ret = 0; 1479 1480 /* transaction restart, running again */ 1481 if (bpos_eq(extent_ends->last_pos, k.k->p)) 1482 return 0; 1483 1484 if (extent_ends->last_pos.inode != k.k->p.inode) 1485 extent_ends_reset(extent_ends); 1486 1487 darray_for_each(extent_ends->e, i) { 1488 if (i->offset <= bkey_start_offset(k.k)) 1489 continue; 1490 
1491 if (!ref_visible2(c, 1492 k.k->p.snapshot, seen, 1493 i->snapshot, &i->seen)) 1494 continue; 1495 1496 ret = overlapping_extents_found(trans, iter->btree_id, 1497 SPOS(iter->pos.inode, 1498 i->offset, 1499 i->snapshot), 1500 &i->seen, 1501 *k.k, fixed, i); 1502 if (ret) 1503 goto err; 1504 } 1505 1506 extent_ends->last_pos = k.k->p; 1507 err: 1508 return ret; 1509 } 1510 1511 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, 1512 struct bkey_s_c k) 1513 { 1514 struct bch_fs *c = trans->c; 1515 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1516 struct bch_extent_crc_unpacked crc; 1517 const union bch_extent_entry *i; 1518 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; 1519 1520 bkey_for_each_crc(k.k, ptrs, crc, i) 1521 if (crc_is_encoded(crc) && 1522 crc.uncompressed_size > encoded_extent_max_sectors) { 1523 struct printbuf buf = PRINTBUF; 1524 1525 bch2_bkey_val_to_text(&buf, c, k); 1526 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); 1527 printbuf_exit(&buf); 1528 } 1529 1530 return 0; 1531 } 1532 1533 static int check_extent(struct btree_trans *trans, struct btree_iter *iter, 1534 struct bkey_s_c k, 1535 struct inode_walker *inode, 1536 struct snapshots_seen *s, 1537 struct extent_ends *extent_ends) 1538 { 1539 struct bch_fs *c = trans->c; 1540 struct inode_walker_entry *i; 1541 struct printbuf buf = PRINTBUF; 1542 int ret = 0; 1543 1544 ret = bch2_check_key_has_snapshot(trans, iter, k); 1545 if (ret) { 1546 ret = ret < 0 ? 
ret : 0; 1547 goto out; 1548 } 1549 1550 if (inode->last_pos.inode != k.k->p.inode) { 1551 ret = check_i_sectors(trans, inode); 1552 if (ret) 1553 goto err; 1554 } 1555 1556 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 1557 if (ret) 1558 goto err; 1559 1560 i = walk_inode(trans, inode, k); 1561 ret = PTR_ERR_OR_ZERO(i); 1562 if (ret) 1563 goto err; 1564 1565 ret = check_key_has_inode(trans, iter, inode, i, k); 1566 if (ret) 1567 goto err; 1568 1569 if (k.k->type != KEY_TYPE_whiteout) { 1570 ret = check_overlapping_extents(trans, s, extent_ends, k, iter, 1571 &inode->recalculate_sums); 1572 if (ret) 1573 goto err; 1574 } 1575 1576 /* 1577 * Check inodes in reverse order, from oldest snapshots to newest, 1578 * starting from the inode that matches this extent's snapshot. If we 1579 * didn't have one, iterate over all inodes: 1580 */ 1581 if (!i) 1582 i = &darray_last(inode->inodes); 1583 1584 for (; 1585 inode->inodes.data && i >= inode->inodes.data; 1586 --i) { 1587 if (i->snapshot > k.k->p.snapshot || 1588 !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) 1589 continue; 1590 1591 if (k.k->type != KEY_TYPE_whiteout) { 1592 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && 1593 k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1594 !bkey_extent_is_reservation(k), 1595 trans, extent_past_end_of_inode, 1596 "extent type past end of inode %llu:%u, i_size %llu\n %s", 1597 i->inode.bi_inum, i->snapshot, i->inode.bi_size, 1598 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1599 struct btree_iter iter2; 1600 1601 bch2_trans_copy_iter(&iter2, iter); 1602 bch2_btree_iter_set_snapshot(&iter2, i->snapshot); 1603 ret = bch2_btree_iter_traverse(&iter2) ?: 1604 bch2_btree_delete_at(trans, &iter2, 1605 BTREE_UPDATE_internal_snapshot_node); 1606 bch2_trans_iter_exit(trans, &iter2); 1607 if (ret) 1608 goto err; 1609 1610 iter->k.type = KEY_TYPE_whiteout; 1611 } 1612 1613 if (bkey_extent_is_allocation(k.k)) 1614 i->count += 
k.k->size; 1615 } 1616 1617 i->seen_this_pos = true; 1618 } 1619 1620 if (k.k->type != KEY_TYPE_whiteout) { 1621 ret = extent_ends_at(c, extent_ends, s, k); 1622 if (ret) 1623 goto err; 1624 } 1625 out: 1626 err: 1627 fsck_err: 1628 printbuf_exit(&buf); 1629 bch_err_fn(c, ret); 1630 return ret; 1631 } 1632 1633 /* 1634 * Walk extents: verify that extents have a corresponding S_ISREG inode, and 1635 * that i_size an i_sectors are consistent 1636 */ 1637 int bch2_check_extents(struct bch_fs *c) 1638 { 1639 struct inode_walker w = inode_walker_init(); 1640 struct snapshots_seen s; 1641 struct extent_ends extent_ends; 1642 struct disk_reservation res = { 0 }; 1643 1644 snapshots_seen_init(&s); 1645 extent_ends_init(&extent_ends); 1646 1647 int ret = bch2_trans_run(c, 1648 for_each_btree_key_commit(trans, iter, BTREE_ID_extents, 1649 POS(BCACHEFS_ROOT_INO, 0), 1650 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 1651 &res, NULL, 1652 BCH_TRANS_COMMIT_no_enospc, ({ 1653 bch2_disk_reservation_put(c, &res); 1654 check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: 1655 check_extent_overbig(trans, &iter, k); 1656 })) ?: 1657 check_i_sectors_notnested(trans, &w)); 1658 1659 bch2_disk_reservation_put(c, &res); 1660 extent_ends_exit(&extent_ends); 1661 inode_walker_exit(&w); 1662 snapshots_seen_exit(&s); 1663 1664 bch_err_fn(c, ret); 1665 return ret; 1666 } 1667 1668 int bch2_check_indirect_extents(struct bch_fs *c) 1669 { 1670 struct disk_reservation res = { 0 }; 1671 1672 int ret = bch2_trans_run(c, 1673 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, 1674 POS_MIN, 1675 BTREE_ITER_prefetch, k, 1676 &res, NULL, 1677 BCH_TRANS_COMMIT_no_enospc, ({ 1678 bch2_disk_reservation_put(c, &res); 1679 check_extent_overbig(trans, &iter, k); 1680 }))); 1681 1682 bch2_disk_reservation_put(c, &res); 1683 bch_err_fn(c, ret); 1684 return ret; 1685 } 1686 1687 static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w) 1688 { 1689 struct bch_fs *c = 
trans->c; 1690 int ret = 0; 1691 s64 count2; 1692 1693 darray_for_each(w->inodes, i) { 1694 if (i->inode.bi_nlink == i->count) 1695 continue; 1696 1697 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); 1698 if (count2 < 0) 1699 return count2; 1700 1701 if (i->count != count2) { 1702 bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", 1703 w->last_pos.inode, i->snapshot, i->count, count2); 1704 i->count = count2; 1705 if (i->inode.bi_nlink == i->count) 1706 continue; 1707 } 1708 1709 if (fsck_err_on(i->inode.bi_nlink != i->count, 1710 trans, inode_dir_wrong_nlink, 1711 "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 1712 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { 1713 i->inode.bi_nlink = i->count; 1714 ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); 1715 if (ret) 1716 break; 1717 } 1718 } 1719 fsck_err: 1720 bch_err_fn(c, ret); 1721 return ret; 1722 } 1723 1724 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) 1725 { 1726 u32 restart_count = trans->restart_count; 1727 return check_subdir_count_notnested(trans, w) ?: 1728 trans_was_restarted(trans, restart_count); 1729 } 1730 1731 noinline_for_stack 1732 static int check_dirent_inode_dirent(struct btree_trans *trans, 1733 struct btree_iter *iter, 1734 struct bkey_s_c_dirent d, 1735 struct bch_inode_unpacked *target, 1736 u32 target_snapshot) 1737 { 1738 struct bch_fs *c = trans->c; 1739 struct printbuf buf = PRINTBUF; 1740 int ret = 0; 1741 1742 if (inode_points_to_dirent(target, d)) 1743 return 0; 1744 1745 if (bch2_inode_should_have_bp(target) && 1746 !fsck_err(trans, inode_wrong_backpointer, 1747 "dirent points to inode that does not point back:\n %s", 1748 (bch2_bkey_val_to_text(&buf, c, d.s_c), 1749 prt_printf(&buf, "\n "), 1750 bch2_inode_unpacked_to_text(&buf, target), 1751 buf.buf))) 1752 goto out_noiter; 1753 1754 if (!target->bi_dir && 1755 
!target->bi_dir_offset) { 1756 target->bi_dir = d.k->p.inode; 1757 target->bi_dir_offset = d.k->p.offset; 1758 return __bch2_fsck_write_inode(trans, target, target_snapshot); 1759 } 1760 1761 struct btree_iter bp_iter = { NULL }; 1762 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, 1763 SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); 1764 ret = bkey_err(bp_dirent); 1765 if (ret && !bch2_err_matches(ret, ENOENT)) 1766 goto err; 1767 1768 bool backpointer_exists = !ret; 1769 ret = 0; 1770 1771 if (fsck_err_on(!backpointer_exists, 1772 trans, inode_wrong_backpointer, 1773 "inode %llu:%u has wrong backpointer:\n" 1774 "got %llu:%llu\n" 1775 "should be %llu:%llu", 1776 target->bi_inum, target_snapshot, 1777 target->bi_dir, 1778 target->bi_dir_offset, 1779 d.k->p.inode, 1780 d.k->p.offset)) { 1781 target->bi_dir = d.k->p.inode; 1782 target->bi_dir_offset = d.k->p.offset; 1783 ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1784 goto out; 1785 } 1786 1787 bch2_bkey_val_to_text(&buf, c, d.s_c); 1788 prt_newline(&buf); 1789 if (backpointer_exists) 1790 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); 1791 1792 if (fsck_err_on(backpointer_exists && 1793 (S_ISDIR(target->bi_mode) || 1794 target->bi_subvol), 1795 trans, inode_dir_multiple_links, 1796 "%s %llu:%u with multiple links\n%s", 1797 S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", 1798 target->bi_inum, target_snapshot, buf.buf)) { 1799 ret = __remove_dirent(trans, d.k->p); 1800 goto out; 1801 } 1802 1803 /* 1804 * hardlinked file with nlink 0: 1805 * We're just adjusting nlink here so check_nlinks() will pick 1806 * it up, it ignores inodes with nlink 0 1807 */ 1808 if (fsck_err_on(backpointer_exists && !target->bi_nlink, 1809 trans, inode_multiple_links_but_nlink_0, 1810 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", 1811 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { 1812 target->bi_nlink++; 1813 target->bi_flags &= ~BCH_INODE_unlinked; 1814 ret = __bch2_fsck_write_inode(trans, target, target_snapshot); 1815 if (ret) 1816 goto err; 1817 } 1818 out: 1819 err: 1820 fsck_err: 1821 bch2_trans_iter_exit(trans, &bp_iter); 1822 out_noiter: 1823 printbuf_exit(&buf); 1824 bch_err_fn(c, ret); 1825 return ret; 1826 } 1827 1828 noinline_for_stack 1829 static int check_dirent_target(struct btree_trans *trans, 1830 struct btree_iter *iter, 1831 struct bkey_s_c_dirent d, 1832 struct bch_inode_unpacked *target, 1833 u32 target_snapshot) 1834 { 1835 struct bch_fs *c = trans->c; 1836 struct bkey_i_dirent *n; 1837 struct printbuf buf = PRINTBUF; 1838 int ret = 0; 1839 1840 ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); 1841 if (ret) 1842 goto err; 1843 1844 if (fsck_err_on(d.v->d_type != inode_d_type(target), 1845 trans, dirent_d_type_wrong, 1846 "incorrect d_type: got %s, should be %s:\n%s", 1847 bch2_d_type_str(d.v->d_type), 1848 bch2_d_type_str(inode_d_type(target)), 1849 (printbuf_reset(&buf), 1850 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1851 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); 1852 ret = PTR_ERR_OR_ZERO(n); 1853 if (ret) 1854 goto err; 1855 1856 bkey_reassemble(&n->k_i, d.s_c); 1857 n->v.d_type = inode_d_type(target); 1858 if (n->v.d_type == DT_SUBVOL) { 1859 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); 1860 
n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); 1861 } else { 1862 n->v.d_inum = cpu_to_le64(target->bi_inum); 1863 } 1864 1865 ret = bch2_trans_update(trans, iter, &n->k_i, 0); 1866 if (ret) 1867 goto err; 1868 1869 d = dirent_i_to_s_c(n); 1870 } 1871 err: 1872 fsck_err: 1873 printbuf_exit(&buf); 1874 bch_err_fn(c, ret); 1875 return ret; 1876 } 1877 1878 /* find a subvolume that's a descendent of @snapshot: */ 1879 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) 1880 { 1881 struct btree_iter iter; 1882 struct bkey_s_c k; 1883 int ret; 1884 1885 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { 1886 if (k.k->type != KEY_TYPE_subvolume) 1887 continue; 1888 1889 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 1890 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { 1891 bch2_trans_iter_exit(trans, &iter); 1892 *subvolid = k.k->p.offset; 1893 goto found; 1894 } 1895 } 1896 if (!ret) 1897 ret = -ENOENT; 1898 found: 1899 bch2_trans_iter_exit(trans, &iter); 1900 return ret; 1901 } 1902 1903 noinline_for_stack 1904 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, 1905 struct bkey_s_c_dirent d) 1906 { 1907 struct bch_fs *c = trans->c; 1908 struct btree_iter subvol_iter = {}; 1909 struct bch_inode_unpacked subvol_root; 1910 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); 1911 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); 1912 u32 parent_snapshot; 1913 u32 new_parent_subvol = 0; 1914 u64 parent_inum; 1915 struct printbuf buf = PRINTBUF; 1916 int ret = 0; 1917 1918 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); 1919 if (ret && !bch2_err_matches(ret, ENOENT)) 1920 return ret; 1921 1922 if (ret || 1923 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { 1924 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); 1925 if (ret2 && 
!bch2_err_matches(ret, ENOENT)) 1926 return ret2; 1927 } 1928 1929 if (ret && 1930 !new_parent_subvol && 1931 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { 1932 /* 1933 * Couldn't find a subvol for dirent's snapshot - but we lost 1934 * subvols, so we need to reconstruct: 1935 */ 1936 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); 1937 if (ret) 1938 return ret; 1939 1940 parent_snapshot = d.k->p.snapshot; 1941 } 1942 1943 if (fsck_err_on(ret, 1944 trans, dirent_to_missing_parent_subvol, 1945 "dirent parent_subvol points to missing subvolume\n%s", 1946 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || 1947 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), 1948 trans, dirent_not_visible_in_parent_subvol, 1949 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", 1950 parent_snapshot, 1951 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { 1952 if (!new_parent_subvol) { 1953 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); 1954 return -BCH_ERR_fsck_repair_unimplemented; 1955 } 1956 1957 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); 1958 ret = PTR_ERR_OR_ZERO(new_dirent); 1959 if (ret) 1960 goto err; 1961 1962 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); 1963 } 1964 1965 struct bkey_s_c_subvolume s = 1966 bch2_bkey_get_iter_typed(trans, &subvol_iter, 1967 BTREE_ID_subvolumes, POS(0, target_subvol), 1968 0, subvolume); 1969 ret = bkey_err(s.s_c); 1970 if (ret && !bch2_err_matches(ret, ENOENT)) 1971 return ret; 1972 1973 if (ret) { 1974 if (fsck_err(trans, dirent_to_missing_subvol, 1975 "dirent points to missing subvolume\n%s", 1976 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) 1977 return __remove_dirent(trans, d.k->p); 1978 ret = 0; 1979 goto out; 1980 } 1981 1982 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, 1983 trans, subvol_fs_path_parent_wrong, 1984 
"subvol with wrong fs_path_parent, should be be %u\n%s", 1985 parent_subvol, 1986 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { 1987 struct bkey_i_subvolume *n = 1988 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); 1989 ret = PTR_ERR_OR_ZERO(n); 1990 if (ret) 1991 goto err; 1992 1993 n->v.fs_path_parent = cpu_to_le32(parent_subvol); 1994 } 1995 1996 u64 target_inum = le64_to_cpu(s.v->inode); 1997 u32 target_snapshot = le32_to_cpu(s.v->snapshot); 1998 1999 ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); 2000 if (ret && !bch2_err_matches(ret, ENOENT)) 2001 goto err; 2002 2003 if (ret) { 2004 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); 2005 ret = -BCH_ERR_fsck_repair_unimplemented; 2006 ret = 0; 2007 goto err; 2008 } 2009 2010 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, 2011 trans, inode_bi_parent_wrong, 2012 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", 2013 target_inum, 2014 subvol_root.bi_parent_subvol, parent_subvol)) { 2015 subvol_root.bi_parent_subvol = parent_subvol; 2016 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); 2017 if (ret) 2018 goto err; 2019 } 2020 2021 ret = check_dirent_target(trans, iter, d, &subvol_root, 2022 target_snapshot); 2023 if (ret) 2024 goto err; 2025 out: 2026 err: 2027 fsck_err: 2028 bch2_trans_iter_exit(trans, &subvol_iter); 2029 printbuf_exit(&buf); 2030 return ret; 2031 } 2032 2033 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, 2034 struct bkey_s_c k, 2035 struct bch_hash_info *hash_info, 2036 struct inode_walker *dir, 2037 struct inode_walker *target, 2038 struct snapshots_seen *s) 2039 { 2040 struct bch_fs *c = trans->c; 2041 struct inode_walker_entry *i; 2042 struct printbuf buf = PRINTBUF; 2043 int ret = 0; 2044 2045 ret = bch2_check_key_has_snapshot(trans, iter, k); 2046 if (ret) { 2047 ret = ret < 0 ? 
ret : 0; 2048 goto out; 2049 } 2050 2051 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); 2052 if (ret) 2053 goto err; 2054 2055 if (k.k->type == KEY_TYPE_whiteout) 2056 goto out; 2057 2058 if (dir->last_pos.inode != k.k->p.inode) { 2059 ret = check_subdir_count(trans, dir); 2060 if (ret) 2061 goto err; 2062 } 2063 2064 i = walk_inode(trans, dir, k); 2065 ret = PTR_ERR_OR_ZERO(i); 2066 if (ret < 0) 2067 goto err; 2068 2069 ret = check_key_has_inode(trans, iter, dir, i, k); 2070 if (ret) 2071 goto err; 2072 2073 if (!i) 2074 goto out; 2075 2076 if (dir->first_this_inode) 2077 *hash_info = bch2_hash_info_init(c, &i->inode); 2078 dir->first_this_inode = false; 2079 2080 ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); 2081 if (ret < 0) 2082 goto err; 2083 if (ret) { 2084 /* dirent has been deleted */ 2085 ret = 0; 2086 goto out; 2087 } 2088 2089 if (k.k->type != KEY_TYPE_dirent) 2090 goto out; 2091 2092 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2093 2094 if (d.v->d_type == DT_SUBVOL) { 2095 ret = check_dirent_to_subvol(trans, iter, d); 2096 if (ret) 2097 goto err; 2098 } else { 2099 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); 2100 if (ret) 2101 goto err; 2102 2103 if (fsck_err_on(!target->inodes.nr, 2104 trans, dirent_to_missing_inode, 2105 "dirent points to missing inode:\n%s", 2106 (printbuf_reset(&buf), 2107 bch2_bkey_val_to_text(&buf, c, k), 2108 buf.buf))) { 2109 ret = __remove_dirent(trans, d.k->p); 2110 if (ret) 2111 goto err; 2112 } 2113 2114 darray_for_each(target->inodes, i) { 2115 ret = check_dirent_target(trans, iter, d, 2116 &i->inode, i->snapshot); 2117 if (ret) 2118 goto err; 2119 } 2120 2121 if (d.v->d_type == DT_DIR) 2122 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) 2123 i->count++; 2124 } 2125 out: 2126 err: 2127 fsck_err: 2128 printbuf_exit(&buf); 2129 bch_err_fn(c, ret); 2130 return ret; 2131 } 2132 2133 /* 2134 * Walk dirents: verify that they all have a corresponding 
 S_ISDIR inode,
 * validate d_type
 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		/* flush subdir counts for the last directory walked */
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Check a single xattr key: valid snapshot, backing inode of a compatible
 * type, and correct hash-table position.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	if (ret)
		/* key was deleted, nothing more to check */
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	if (!i)
		return 0;

	/* hash seed comes from the inode; compute it once per inode */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, 2210 POS(BCACHEFS_ROOT_INO, 0), 2211 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, 2212 k, 2213 NULL, NULL, 2214 BCH_TRANS_COMMIT_no_enospc, 2215 check_xattr(trans, &iter, k, &hash_info, &inode))); 2216 bch_err_fn(c, ret); 2217 return ret; 2218 } 2219 2220 static int check_root_trans(struct btree_trans *trans) 2221 { 2222 struct bch_fs *c = trans->c; 2223 struct bch_inode_unpacked root_inode; 2224 u32 snapshot; 2225 u64 inum; 2226 int ret; 2227 2228 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); 2229 if (ret && !bch2_err_matches(ret, ENOENT)) 2230 return ret; 2231 2232 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing, 2233 "root subvol missing")) { 2234 struct bkey_i_subvolume *root_subvol = 2235 bch2_trans_kmalloc(trans, sizeof(*root_subvol)); 2236 ret = PTR_ERR_OR_ZERO(root_subvol); 2237 if (ret) 2238 goto err; 2239 2240 snapshot = U32_MAX; 2241 inum = BCACHEFS_ROOT_INO; 2242 2243 bkey_subvolume_init(&root_subvol->k_i); 2244 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; 2245 root_subvol->v.flags = 0; 2246 root_subvol->v.snapshot = cpu_to_le32(snapshot); 2247 root_subvol->v.inode = cpu_to_le64(inum); 2248 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); 2249 bch_err_msg(c, ret, "writing root subvol"); 2250 if (ret) 2251 goto err; 2252 } 2253 2254 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); 2255 if (ret && !bch2_err_matches(ret, ENOENT)) 2256 return ret; 2257 2258 if (mustfix_fsck_err_on(ret, 2259 trans, root_dir_missing, 2260 "root directory missing") || 2261 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), 2262 trans, root_inode_not_dir, 2263 "root inode not a directory")) { 2264 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 2265 0, NULL); 2266 root_inode.bi_inum = inum; 2267 2268 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); 2269 bch_err_msg(c, ret, "writing root inode"); 2270 } 2271 err: 2272 
fsck_err: 2273 return ret; 2274 } 2275 2276 /* Get root directory, create if it doesn't exist: */ 2277 int bch2_check_root(struct bch_fs *c) 2278 { 2279 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2280 check_root_trans(trans)); 2281 bch_err_fn(c, ret); 2282 return ret; 2283 } 2284 2285 typedef DARRAY(u32) darray_u32; 2286 2287 static bool darray_u32_has(darray_u32 *d, u32 v) 2288 { 2289 darray_for_each(*d, i) 2290 if (*i == v) 2291 return true; 2292 return false; 2293 } 2294 2295 /* 2296 * We've checked that inode backpointers point to valid dirents; here, it's 2297 * sufficient to check that the subvolume root has a dirent: 2298 */ 2299 static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) 2300 { 2301 struct bch_inode_unpacked inode; 2302 int ret = bch2_inode_find_by_inum_trans(trans, 2303 (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, 2304 &inode); 2305 if (ret) 2306 return ret; 2307 2308 return inode.bi_dir != 0; 2309 } 2310 2311 static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) 2312 { 2313 struct bch_fs *c = trans->c; 2314 struct btree_iter parent_iter = {}; 2315 darray_u32 subvol_path = {}; 2316 struct printbuf buf = PRINTBUF; 2317 int ret = 0; 2318 2319 if (k.k->type != KEY_TYPE_subvolume) 2320 return 0; 2321 2322 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2323 ret = darray_push(&subvol_path, k.k->p.offset); 2324 if (ret) 2325 goto err; 2326 2327 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); 2328 2329 ret = subvol_has_dirent(trans, s); 2330 if (ret < 0) 2331 break; 2332 2333 if (fsck_err_on(!ret, 2334 trans, subvol_unreachable, 2335 "unreachable subvolume %s", 2336 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2337 buf.buf))) { 2338 ret = reattach_subvol(trans, s); 2339 break; 2340 } 2341 2342 u32 parent = le32_to_cpu(s.v->fs_path_parent); 2343 2344 if (darray_u32_has(&subvol_path, parent)) { 2345 if (fsck_err(c, subvol_loop, "subvolume 
loop")) 2346 ret = reattach_subvol(trans, s); 2347 break; 2348 } 2349 2350 bch2_trans_iter_exit(trans, &parent_iter); 2351 bch2_trans_iter_init(trans, &parent_iter, 2352 BTREE_ID_subvolumes, POS(0, parent), 0); 2353 k = bch2_btree_iter_peek_slot(&parent_iter); 2354 ret = bkey_err(k); 2355 if (ret) 2356 goto err; 2357 2358 if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, 2359 trans, subvol_unreachable, 2360 "unreachable subvolume %s", 2361 (bch2_bkey_val_to_text(&buf, c, s.s_c), 2362 buf.buf))) { 2363 ret = reattach_subvol(trans, s); 2364 break; 2365 } 2366 } 2367 fsck_err: 2368 err: 2369 printbuf_exit(&buf); 2370 darray_exit(&subvol_path); 2371 bch2_trans_iter_exit(trans, &parent_iter); 2372 return ret; 2373 } 2374 2375 int bch2_check_subvolume_structure(struct bch_fs *c) 2376 { 2377 int ret = bch2_trans_run(c, 2378 for_each_btree_key_commit(trans, iter, 2379 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, 2380 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2381 check_subvol_path(trans, &iter, k))); 2382 bch_err_fn(c, ret); 2383 return ret; 2384 } 2385 2386 struct pathbuf_entry { 2387 u64 inum; 2388 u32 snapshot; 2389 }; 2390 2391 typedef DARRAY(struct pathbuf_entry) pathbuf; 2392 2393 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) 2394 { 2395 darray_for_each(*p, i) 2396 if (i->inum == inum && 2397 i->snapshot == snapshot) 2398 return true; 2399 return false; 2400 } 2401 2402 /* 2403 * Check that a given inode is reachable from its subvolume root - we already 2404 * verified subvolume connectivity: 2405 * 2406 * XXX: we should also be verifying that inodes are in the right subvolumes 2407 */ 2408 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) 2409 { 2410 struct bch_fs *c = trans->c; 2411 struct btree_iter inode_iter = {}; 2412 struct bch_inode_unpacked inode; 2413 struct printbuf buf = PRINTBUF; 2414 u32 snapshot = inode_k.k->p.snapshot; 2415 int ret = 0; 2416 2417 p->nr = 0; 2418 2419 
BUG_ON(bch2_inode_unpack(inode_k, &inode)); 2420 2421 while (!inode.bi_subvol) { 2422 struct btree_iter dirent_iter; 2423 struct bkey_s_c_dirent d; 2424 u32 parent_snapshot = snapshot; 2425 2426 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); 2427 ret = bkey_err(d.s_c); 2428 if (ret && !bch2_err_matches(ret, ENOENT)) 2429 break; 2430 2431 if (!ret && !dirent_points_to_inode(d, &inode)) { 2432 bch2_trans_iter_exit(trans, &dirent_iter); 2433 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; 2434 } 2435 2436 if (bch2_err_matches(ret, ENOENT)) { 2437 ret = 0; 2438 if (fsck_err(trans, inode_unreachable, 2439 "unreachable inode\n%s", 2440 (printbuf_reset(&buf), 2441 bch2_bkey_val_to_text(&buf, c, inode_k), 2442 buf.buf))) 2443 ret = reattach_inode(trans, &inode, snapshot); 2444 goto out; 2445 } 2446 2447 bch2_trans_iter_exit(trans, &dirent_iter); 2448 2449 if (!S_ISDIR(inode.bi_mode)) 2450 break; 2451 2452 ret = darray_push(p, ((struct pathbuf_entry) { 2453 .inum = inode.bi_inum, 2454 .snapshot = snapshot, 2455 })); 2456 if (ret) 2457 return ret; 2458 2459 snapshot = parent_snapshot; 2460 2461 bch2_trans_iter_exit(trans, &inode_iter); 2462 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, 2463 SPOS(0, inode.bi_dir, snapshot), 0); 2464 ret = bkey_err(inode_k) ?: 2465 !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode 2466 : bch2_inode_unpack(inode_k, &inode); 2467 if (ret) { 2468 /* Should have been caught in dirents pass */ 2469 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2470 bch_err(c, "error looking up parent directory: %i", ret); 2471 break; 2472 } 2473 2474 snapshot = inode_k.k->p.snapshot; 2475 2476 if (path_is_dup(p, inode.bi_inum, snapshot)) { 2477 /* XXX print path */ 2478 bch_err(c, "directory structure loop"); 2479 2480 darray_for_each(*p, i) 2481 pr_err("%llu:%u", i->inum, i->snapshot); 2482 pr_err("%llu:%u", inode.bi_inum, snapshot); 2483 2484 if (fsck_err(trans, dir_loop, "directory structure loop")) { 2485 ret = remove_backpointer(trans, &inode); 2486 bch_err_msg(c, ret, "removing dirent"); 2487 if (ret) 2488 break; 2489 2490 ret = reattach_inode(trans, &inode, snapshot); 2491 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); 2492 } 2493 break; 2494 } 2495 } 2496 out: 2497 fsck_err: 2498 bch2_trans_iter_exit(trans, &inode_iter); 2499 printbuf_exit(&buf); 2500 bch_err_fn(c, ret); 2501 return ret; 2502 } 2503 2504 /* 2505 * Check for unreachable inodes, as well as loops in the directory structure: 2506 * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's 2507 * unreachable: 2508 */ 2509 int bch2_check_directory_structure(struct bch_fs *c) 2510 { 2511 pathbuf path = { 0, }; 2512 int ret; 2513 2514 ret = bch2_trans_run(c, 2515 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2516 BTREE_ITER_intent| 2517 BTREE_ITER_prefetch| 2518 BTREE_ITER_all_snapshots, k, 2519 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 2520 if (!bkey_is_inode(k.k)) 2521 continue; 2522 2523 if (bch2_inode_flags(k) & BCH_INODE_unlinked) 2524 continue; 2525 2526 check_path(trans, &path, k); 2527 }))); 2528 darray_exit(&path); 2529 2530 bch_err_fn(c, ret); 2531 return ret; 2532 } 2533 2534 struct nlink_table { 2535 size_t nr; 2536 size_t size; 2537 2538 struct nlink { 2539 u64 inum; 2540 u32 snapshot; 2541 
u32 count; 2542 } *d; 2543 }; 2544 2545 static int add_nlink(struct bch_fs *c, struct nlink_table *t, 2546 u64 inum, u32 snapshot) 2547 { 2548 if (t->nr == t->size) { 2549 size_t new_size = max_t(size_t, 128UL, t->size * 2); 2550 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); 2551 2552 if (!d) { 2553 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", 2554 new_size); 2555 return -BCH_ERR_ENOMEM_fsck_add_nlink; 2556 } 2557 2558 if (t->d) 2559 memcpy(d, t->d, t->size * sizeof(t->d[0])); 2560 kvfree(t->d); 2561 2562 t->d = d; 2563 t->size = new_size; 2564 } 2565 2566 2567 t->d[t->nr++] = (struct nlink) { 2568 .inum = inum, 2569 .snapshot = snapshot, 2570 }; 2571 2572 return 0; 2573 } 2574 2575 static int nlink_cmp(const void *_l, const void *_r) 2576 { 2577 const struct nlink *l = _l; 2578 const struct nlink *r = _r; 2579 2580 return cmp_int(l->inum, r->inum); 2581 } 2582 2583 static void inc_link(struct bch_fs *c, struct snapshots_seen *s, 2584 struct nlink_table *links, 2585 u64 range_start, u64 range_end, u64 inum, u32 snapshot) 2586 { 2587 struct nlink *link, key = { 2588 .inum = inum, .snapshot = U32_MAX, 2589 }; 2590 2591 if (inum < range_start || inum >= range_end) 2592 return; 2593 2594 link = __inline_bsearch(&key, links->d, links->nr, 2595 sizeof(links->d[0]), nlink_cmp); 2596 if (!link) 2597 return; 2598 2599 while (link > links->d && link[0].inum == link[-1].inum) 2600 --link; 2601 2602 for (; link < links->d + links->nr && link->inum == inum; link++) 2603 if (ref_visible(c, s, snapshot, link->snapshot)) { 2604 link->count++; 2605 if (link->snapshot >= snapshot) 2606 break; 2607 } 2608 } 2609 2610 noinline_for_stack 2611 static int check_nlinks_find_hardlinks(struct bch_fs *c, 2612 struct nlink_table *t, 2613 u64 start, u64 *end) 2614 { 2615 int ret = bch2_trans_run(c, 2616 for_each_btree_key(trans, iter, BTREE_ID_inodes, 2617 POS(0, start), 2618 BTREE_ITER_intent| 2619 BTREE_ITER_prefetch| 2620 BTREE_ITER_all_snapshots, 
k, ({ 2621 if (!bkey_is_inode(k.k)) 2622 continue; 2623 2624 /* Should never fail, checked by bch2_inode_invalid: */ 2625 struct bch_inode_unpacked u; 2626 BUG_ON(bch2_inode_unpack(k, &u)); 2627 2628 /* 2629 * Backpointer and directory structure checks are sufficient for 2630 * directories, since they can't have hardlinks: 2631 */ 2632 if (S_ISDIR(u.bi_mode)) 2633 continue; 2634 2635 if (!u.bi_nlink) 2636 continue; 2637 2638 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); 2639 if (ret) { 2640 *end = k.k->p.offset; 2641 ret = 0; 2642 break; 2643 } 2644 0; 2645 }))); 2646 2647 bch_err_fn(c, ret); 2648 return ret; 2649 } 2650 2651 noinline_for_stack 2652 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, 2653 u64 range_start, u64 range_end) 2654 { 2655 struct snapshots_seen s; 2656 2657 snapshots_seen_init(&s); 2658 2659 int ret = bch2_trans_run(c, 2660 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, 2661 BTREE_ITER_intent| 2662 BTREE_ITER_prefetch| 2663 BTREE_ITER_all_snapshots, k, ({ 2664 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); 2665 if (ret) 2666 break; 2667 2668 if (k.k->type == KEY_TYPE_dirent) { 2669 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); 2670 2671 if (d.v->d_type != DT_DIR && 2672 d.v->d_type != DT_SUBVOL) 2673 inc_link(c, &s, links, range_start, range_end, 2674 le64_to_cpu(d.v->d_inum), d.k->p.snapshot); 2675 } 2676 0; 2677 }))); 2678 2679 snapshots_seen_exit(&s); 2680 2681 bch_err_fn(c, ret); 2682 return ret; 2683 } 2684 2685 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, 2686 struct bkey_s_c k, 2687 struct nlink_table *links, 2688 size_t *idx, u64 range_end) 2689 { 2690 struct bch_inode_unpacked u; 2691 struct nlink *link = &links->d[*idx]; 2692 int ret = 0; 2693 2694 if (k.k->p.offset >= range_end) 2695 return 1; 2696 2697 if (!bkey_is_inode(k.k)) 2698 return 0; 2699 2700 BUG_ON(bch2_inode_unpack(k, &u)); 2701 2702 if (S_ISDIR(u.bi_mode)) 
2703 return 0; 2704 2705 if (!u.bi_nlink) 2706 return 0; 2707 2708 while ((cmp_int(link->inum, k.k->p.offset) ?: 2709 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { 2710 BUG_ON(*idx == links->nr); 2711 link = &links->d[++*idx]; 2712 } 2713 2714 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, 2715 trans, inode_wrong_nlink, 2716 "inode %llu type %s has wrong i_nlink (%u, should be %u)", 2717 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], 2718 bch2_inode_nlink_get(&u), link->count)) { 2719 bch2_inode_nlink_set(&u, link->count); 2720 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); 2721 } 2722 fsck_err: 2723 return ret; 2724 } 2725 2726 noinline_for_stack 2727 static int check_nlinks_update_hardlinks(struct bch_fs *c, 2728 struct nlink_table *links, 2729 u64 range_start, u64 range_end) 2730 { 2731 size_t idx = 0; 2732 2733 int ret = bch2_trans_run(c, 2734 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, 2735 POS(0, range_start), 2736 BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, 2737 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2738 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); 2739 if (ret < 0) { 2740 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); 2741 return ret; 2742 } 2743 2744 return 0; 2745 } 2746 2747 int bch2_check_nlinks(struct bch_fs *c) 2748 { 2749 struct nlink_table links = { 0 }; 2750 u64 this_iter_range_start, next_iter_range_start = 0; 2751 int ret = 0; 2752 2753 do { 2754 this_iter_range_start = next_iter_range_start; 2755 next_iter_range_start = U64_MAX; 2756 2757 ret = check_nlinks_find_hardlinks(c, &links, 2758 this_iter_range_start, 2759 &next_iter_range_start); 2760 2761 ret = check_nlinks_walk_dirents(c, &links, 2762 this_iter_range_start, 2763 next_iter_range_start); 2764 if (ret) 2765 break; 2766 2767 ret = check_nlinks_update_hardlinks(c, &links, 2768 this_iter_range_start, 2769 next_iter_range_start); 2770 if (ret) 2771 break; 2772 2773 links.nr = 0; 
2774 } while (next_iter_range_start != U64_MAX); 2775 2776 kvfree(links.d); 2777 bch_err_fn(c, ret); 2778 return ret; 2779 } 2780 2781 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, 2782 struct bkey_s_c k) 2783 { 2784 struct bkey_s_c_reflink_p p; 2785 struct bkey_i_reflink_p *u; 2786 2787 if (k.k->type != KEY_TYPE_reflink_p) 2788 return 0; 2789 2790 p = bkey_s_c_to_reflink_p(k); 2791 2792 if (!p.v->front_pad && !p.v->back_pad) 2793 return 0; 2794 2795 u = bch2_trans_kmalloc(trans, sizeof(*u)); 2796 int ret = PTR_ERR_OR_ZERO(u); 2797 if (ret) 2798 return ret; 2799 2800 bkey_reassemble(&u->k_i, k); 2801 u->v.front_pad = 0; 2802 u->v.back_pad = 0; 2803 2804 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); 2805 } 2806 2807 int bch2_fix_reflink_p(struct bch_fs *c) 2808 { 2809 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) 2810 return 0; 2811 2812 int ret = bch2_trans_run(c, 2813 for_each_btree_key_commit(trans, iter, 2814 BTREE_ID_extents, POS_MIN, 2815 BTREE_ITER_intent|BTREE_ITER_prefetch| 2816 BTREE_ITER_all_snapshots, k, 2817 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 2818 fix_reflink_p_key(trans, &iter, k))); 2819 bch_err_fn(c, ret); 2820 return ret; 2821 } 2822